summaryrefslogtreecommitdiffstats
path: root/net/sched
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
commit76cb841cb886eef6b3bee341a2266c76578724ad (patch)
treef5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /net/sched
parentInitial commit. (diff)
downloadlinux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz
linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip
Adding upstream version 4.19.249.upstream/4.19.249
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig927
-rw-r--r--net/sched/Makefile81
-rw-r--r--net/sched/act_api.c1739
-rw-r--r--net/sched/act_bpf.c446
-rw-r--r--net/sched/act_connmark.c250
-rw-r--r--net/sched/act_csum.c737
-rw-r--r--net/sched/act_gact.c302
-rw-r--r--net/sched/act_ife.c920
-rw-r--r--net/sched/act_ipt.c452
-rw-r--r--net/sched/act_meta_mark.c78
-rw-r--r--net/sched/act_meta_skbprio.c76
-rw-r--r--net/sched/act_meta_skbtcindex.c78
-rw-r--r--net/sched/act_mirred.c462
-rw-r--r--net/sched/act_nat.c349
-rw-r--r--net/sched/act_pedit.c541
-rw-r--r--net/sched/act_police.c371
-rw-r--r--net/sched/act_sample.c292
-rw-r--r--net/sched/act_simple.c251
-rw-r--r--net/sched/act_skbedit.c349
-rw-r--r--net/sched/act_skbmod.c314
-rw-r--r--net/sched/act_tunnel_key.c616
-rw-r--r--net/sched/act_vlan.c357
-rw-r--r--net/sched/cls_api.c2320
-rw-r--r--net/sched/cls_basic.c331
-rw-r--r--net/sched/cls_bpf.c722
-rw-r--r--net/sched/cls_cgroup.c221
-rw-r--r--net/sched/cls_flow.c725
-rw-r--r--net/sched/cls_flower.c2001
-rw-r--r--net/sched/cls_fw.c474
-rw-r--r--net/sched/cls_matchall.c357
-rw-r--r--net/sched/cls_route.c687
-rw-r--r--net/sched/cls_rsvp.c28
-rw-r--r--net/sched/cls_rsvp.h775
-rw-r--r--net/sched/cls_rsvp6.c28
-rw-r--r--net/sched/cls_tcindex.c698
-rw-r--r--net/sched/cls_u32.c1507
-rw-r--r--net/sched/em_canid.c233
-rw-r--r--net/sched/em_cmp.c99
-rw-r--r--net/sched/em_ipset.c137
-rw-r--r--net/sched/em_ipt.c257
-rw-r--r--net/sched/em_meta.c1014
-rw-r--r--net/sched/em_nbyte.c80
-rw-r--r--net/sched/em_text.c157
-rw-r--r--net/sched/em_u32.c64
-rw-r--r--net/sched/ematch.c552
-rw-r--r--net/sched/sch_api.c2209
-rw-r--r--net/sched/sch_atm.c705
-rw-r--r--net/sched/sch_blackhole.c45
-rw-r--r--net/sched/sch_cake.c3058
-rw-r--r--net/sched/sch_cbq.c1823
-rw-r--r--net/sched/sch_cbs.c588
-rw-r--r--net/sched/sch_choke.c528
-rw-r--r--net/sched/sch_codel.c309
-rw-r--r--net/sched/sch_drr.c514
-rw-r--r--net/sched/sch_dsmark.c519
-rw-r--r--net/sched/sch_etf.c485
-rw-r--r--net/sched/sch_fifo.c190
-rw-r--r--net/sched/sch_fq.c936
-rw-r--r--net/sched/sch_fq_codel.c744
-rw-r--r--net/sched/sch_generic.c1447
-rw-r--r--net/sched/sch_gred.c620
-rw-r--r--net/sched/sch_hfsc.c1697
-rw-r--r--net/sched/sch_hhf.c721
-rw-r--r--net/sched/sch_htb.c1621
-rw-r--r--net/sched/sch_ingress.c300
-rw-r--r--net/sched/sch_mq.c314
-rw-r--r--net/sched/sch_mqprio.c675
-rw-r--r--net/sched/sch_multiq.c425
-rw-r--r--net/sched/sch_netem.c1275
-rw-r--r--net/sched/sch_pie.c572
-rw-r--r--net/sched/sch_plug.c235
-rw-r--r--net/sched/sch_prio.c484
-rw-r--r--net/sched/sch_qfq.c1532
-rw-r--r--net/sched/sch_red.c455
-rw-r--r--net/sched/sch_sfb.c734
-rw-r--r--net/sched/sch_sfq.c944
-rw-r--r--net/sched/sch_skbprio.c323
-rw-r--r--net/sched/sch_tbf.c563
-rw-r--r--net/sched/sch_teql.c530
79 files changed, 51575 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
new file mode 100644
index 000000000..e95741388
--- /dev/null
+++ b/net/sched/Kconfig
@@ -0,0 +1,927 @@
+#
+# Traffic control configuration.
+#
+
+menuconfig NET_SCHED
+ bool "QoS and/or fair queueing"
+ select NET_SCH_FIFO
+ ---help---
+ When the kernel has several packets to send out over a network
+ device, it has to decide which ones to send first, which ones to
+ delay, and which ones to drop. This is the job of the queueing
+ disciplines, several different algorithms for how to do this
+ "fairly" have been proposed.
+
+ If you say N here, you will get the standard packet scheduler, which
+ is a FIFO (first come, first served). If you say Y here, you will be
+ able to choose from among several alternative algorithms which can
+ then be attached to different network devices. This is useful for
+ example if some of your network devices are real time devices that
+ need a certain minimum data flow rate, or if you need to limit the
+ maximum data flow rate for traffic which matches specified criteria.
+ This code is considered to be experimental.
+
+ To administer these schedulers, you'll need the user-level utilities
+ from the package iproute2+tc at
+ <https://www.kernel.org/pub/linux/utils/net/iproute2/>. That package
+ also contains some documentation; for more, check out
+ <http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>.
+
+ This Quality of Service (QoS) support will enable you to use
+ Differentiated Services (diffserv) and Resource Reservation Protocol
+ (RSVP) on your Linux router if you also say Y to the corresponding
+ classifiers below. Documentation and software is at
+ <http://diffserv.sourceforge.net/>.
+
+ If you say Y here and to "/proc file system" below, you will be able
+ to read status information about packet schedulers from the file
+ /proc/net/psched.
+
+ The available schedulers are listed in the following questions; you
+ can say Y to as many as you like. If unsure, say N now.
+
+if NET_SCHED
+
+comment "Queueing/Scheduling"
+
+config NET_SCH_CBQ
+ tristate "Class Based Queueing (CBQ)"
+ ---help---
+ Say Y here if you want to use the Class-Based Queueing (CBQ) packet
+ scheduling algorithm. This algorithm classifies the waiting packets
+ into a tree-like hierarchy of classes; the leaves of this tree are
+ in turn scheduled by separate algorithms.
+
+ See the top of <file:net/sched/sch_cbq.c> for more details.
+
+ CBQ is a commonly used scheduler, so if you're unsure, you should
+ say Y here. Then say Y to all the queueing algorithms below that you
+ want to use as leaf disciplines.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_cbq.
+
+config NET_SCH_HTB
+ tristate "Hierarchical Token Bucket (HTB)"
+ ---help---
+ Say Y here if you want to use the Hierarchical Token Buckets (HTB)
+ packet scheduling algorithm. See
+ <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
+ in-depth articles.
+
+ HTB is very similar to CBQ regarding its goals however is has
+ different properties and different algorithm.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_htb.
+
+config NET_SCH_HFSC
+ tristate "Hierarchical Fair Service Curve (HFSC)"
+ ---help---
+ Say Y here if you want to use the Hierarchical Fair Service Curve
+ (HFSC) packet scheduling algorithm.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_hfsc.
+
+config NET_SCH_ATM
+ tristate "ATM Virtual Circuits (ATM)"
+ depends on ATM
+ ---help---
+ Say Y here if you want to use the ATM pseudo-scheduler. This
+ provides a framework for invoking classifiers, which in turn
+ select classes of this queuing discipline. Each class maps
+ the flow(s) it is handling to a given virtual circuit.
+
+ See the top of <file:net/sched/sch_atm.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_atm.
+
+config NET_SCH_PRIO
+ tristate "Multi Band Priority Queueing (PRIO)"
+ ---help---
+ Say Y here if you want to use an n-band priority queue packet
+ scheduler.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_prio.
+
+config NET_SCH_MULTIQ
+ tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
+ ---help---
+ Say Y here if you want to use an n-band queue packet scheduler
+ to support devices that have multiple hardware transmit queues.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_multiq.
+
+config NET_SCH_RED
+ tristate "Random Early Detection (RED)"
+ ---help---
+ Say Y here if you want to use the Random Early Detection (RED)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_red.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_red.
+
+config NET_SCH_SFB
+ tristate "Stochastic Fair Blue (SFB)"
+ ---help---
+ Say Y here if you want to use the Stochastic Fair Blue (SFB)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfb.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_sfb.
+
+config NET_SCH_SFQ
+ tristate "Stochastic Fairness Queueing (SFQ)"
+ ---help---
+ Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfq.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_sfq.
+
+config NET_SCH_TEQL
+ tristate "True Link Equalizer (TEQL)"
+ ---help---
+ Say Y here if you want to use the True Link Equalizer (TLE) packet
+ scheduling algorithm. This queueing discipline allows the combination
+ of several physical devices into one virtual device.
+
+ See the top of <file:net/sched/sch_teql.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_teql.
+
+config NET_SCH_TBF
+ tristate "Token Bucket Filter (TBF)"
+ ---help---
+ Say Y here if you want to use the Token Bucket Filter (TBF) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_tbf.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_tbf.
+
+config NET_SCH_CBS
+ tristate "Credit Based Shaper (CBS)"
+ ---help---
+ Say Y here if you want to use the Credit Based Shaper (CBS) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_cbs.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_cbs.
+
+config NET_SCH_ETF
+ tristate "Earliest TxTime First (ETF)"
+ help
+ Say Y here if you want to use the Earliest TxTime First (ETF) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_etf.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_etf.
+
+config NET_SCH_GRED
+ tristate "Generic Random Early Detection (GRED)"
+ ---help---
+ Say Y here if you want to use the Generic Random Early Detection
+ (GRED) packet scheduling algorithm for some of your network devices
+ (see the top of <file:net/sched/sch_red.c> for details and
+ references about the algorithm).
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_gred.
+
+config NET_SCH_DSMARK
+ tristate "Differentiated Services marker (DSMARK)"
+ ---help---
+ Say Y if you want to schedule packets according to the
+ Differentiated Services architecture proposed in RFC 2475.
+ Technical information on this method, with pointers to associated
+ RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_dsmark.
+
+config NET_SCH_NETEM
+ tristate "Network emulator (NETEM)"
+ ---help---
+ Say Y if you want to emulate network delay, loss, and packet
+ re-ordering. This is often useful to simulate networks when
+ testing applications or protocols.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_netem.
+
+ If unsure, say N.
+
+config NET_SCH_DRR
+ tristate "Deficit Round Robin scheduler (DRR)"
+ help
+ Say Y here if you want to use the Deficit Round Robin (DRR) packet
+ scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_drr.
+
+ If unsure, say N.
+
+config NET_SCH_MQPRIO
+ tristate "Multi-queue priority scheduler (MQPRIO)"
+ help
+ Say Y here if you want to use the Multi-queue Priority scheduler.
+ This scheduler allows QOS to be offloaded on NICs that have support
+ for offloading QOS schedulers.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_mqprio.
+
+ If unsure, say N.
+
+config NET_SCH_SKBPRIO
+ tristate "SKB priority queue scheduler (SKBPRIO)"
+ help
+ Say Y here if you want to use the SKB priority queue
+ scheduler. This schedules packets according to skb->priority,
+ which is useful for request packets in DoS mitigation systems such
+ as Gatekeeper.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_skbprio.
+
+ If unsure, say N.
+
+config NET_SCH_CHOKE
+ tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+ help
+ Say Y here if you want to use the CHOKe packet scheduler (CHOose
+ and Keep for responsive flows, CHOose and Kill for unresponsive
+ flows). This is a variation of RED which trys to penalize flows
+ that monopolize the queue.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_choke.
+
+config NET_SCH_QFQ
+ tristate "Quick Fair Queueing scheduler (QFQ)"
+ help
+ Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_qfq.
+
+ If unsure, say N.
+
+config NET_SCH_CODEL
+ tristate "Controlled Delay AQM (CODEL)"
+ help
+ Say Y here if you want to use the Controlled Delay (CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_codel.
+
+ If unsure, say N.
+
+config NET_SCH_FQ_CODEL
+ tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)"
+ help
+ Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq_codel.
+
+ If unsure, say N.
+
+config NET_SCH_CAKE
+ tristate "Common Applications Kept Enhanced (CAKE)"
+ help
+ Say Y here if you want to use the Common Applications Kept Enhanced
+ (CAKE) queue management algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_cake.
+
+ If unsure, say N.
+
+config NET_SCH_FQ
+ tristate "Fair Queue"
+ help
+ Say Y here if you want to use the FQ packet scheduling algorithm.
+
+ FQ does flow separation, and is able to respect pacing requirements
+ set by TCP stack into sk->sk_pacing_rate (for localy generated
+ traffic)
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq.
+
+ If unsure, say N.
+
+config NET_SCH_HHF
+ tristate "Heavy-Hitter Filter (HHF)"
+ help
+ Say Y here if you want to use the Heavy-Hitter Filter (HHF)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_hhf.
+
+config NET_SCH_PIE
+ tristate "Proportional Integral controller Enhanced (PIE) scheduler"
+ help
+ Say Y here if you want to use the Proportional Integral controller
+ Enhanced scheduler packet scheduling algorithm.
+ For more information, please see
+ http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_pie.
+
+ If unsure, say N.
+
+config NET_SCH_INGRESS
+ tristate "Ingress/classifier-action Qdisc"
+ depends on NET_CLS_ACT
+ select NET_INGRESS
+ select NET_EGRESS
+ ---help---
+ Say Y here if you want to use classifiers for incoming and/or outgoing
+ packets. This qdisc doesn't do anything else besides running classifiers,
+ which can also have actions attached to them. In case of outgoing packets,
+ classifiers that this qdisc holds are executed in the transmit path
+ before real enqueuing to an egress qdisc happens.
+
+ If unsure, say Y.
+
+ To compile this code as a module, choose M here: the module will be
+ called sch_ingress with alias of sch_clsact.
+
+config NET_SCH_PLUG
+ tristate "Plug network traffic until release (PLUG)"
+ ---help---
+
+ This queuing discipline allows userspace to plug/unplug a network
+ output queue, using the netlink interface. When it receives an
+ enqueue command it inserts a plug into the outbound queue that
+ causes following packets to enqueue until a dequeue command arrives
+ over netlink, causing the plug to be removed and resuming the normal
+ packet flow.
+
+ This module also provides a generic "network output buffering"
+ functionality (aka output commit), wherein upon arrival of a dequeue
+ command, only packets up to the first plug are released for delivery.
+ The Remus HA project uses this module to enable speculative execution
+ of virtual machines by allowing the generated network output to be rolled
+ back if needed.
+
+ For more information, please refer to <http://wiki.xenproject.org/wiki/Remus>
+
+ Say Y here if you are using this kernel for Xen dom0 and
+ want to protect Xen guests with Remus.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_plug.
+
+menuconfig NET_SCH_DEFAULT
+ bool "Allow override default queue discipline"
+ ---help---
+ Support for selection of default queuing discipline.
+
+ Nearly all users can safely say no here, and the default
+ of pfifo_fast will be used. Many distributions already set
+ the default value via /proc/sys/net/core/default_qdisc.
+
+ If unsure, say N.
+
+if NET_SCH_DEFAULT
+
+choice
+ prompt "Default queuing discipline"
+ default DEFAULT_PFIFO_FAST
+ help
+ Select the queueing discipline that will be used by default
+ for all network devices.
+
+ config DEFAULT_FQ
+ bool "Fair Queue" if NET_SCH_FQ
+
+ config DEFAULT_CODEL
+ bool "Controlled Delay" if NET_SCH_CODEL
+
+ config DEFAULT_FQ_CODEL
+ bool "Fair Queue Controlled Delay" if NET_SCH_FQ_CODEL
+
+ config DEFAULT_SFQ
+ bool "Stochastic Fair Queue" if NET_SCH_SFQ
+
+ config DEFAULT_PFIFO_FAST
+ bool "Priority FIFO Fast"
+endchoice
+
+config DEFAULT_NET_SCH
+ string
+ default "pfifo_fast" if DEFAULT_PFIFO_FAST
+ default "fq" if DEFAULT_FQ
+ default "fq_codel" if DEFAULT_FQ_CODEL
+ default "sfq" if DEFAULT_SFQ
+ default "pfifo_fast"
+endif
+
+comment "Classification"
+
+config NET_CLS
+ bool
+
+config NET_CLS_BASIC
+ tristate "Elementary classification (BASIC)"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to be able to classify packets using
+ only extended matches and actions.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_basic.
+
+config NET_CLS_TCINDEX
+ tristate "Traffic-Control Index (TCINDEX)"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ traffic control indices. You will want this feature if you want
+ to implement Differentiated Services together with DSMARK.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_tcindex.
+
+config NET_CLS_ROUTE4
+ tristate "Routing decision (ROUTE)"
+ depends on INET
+ select IP_ROUTE_CLASSID
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets
+ according to the route table entry they matched.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_route.
+
+config NET_CLS_FW
+ tristate "Netfilter mark (FW)"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets
+ according to netfilter/firewall marks.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_fw.
+
+config NET_CLS_U32
+ tristate "Universal 32bit comparisons w/ hashing (U32)"
+ select NET_CLS
+ ---help---
+ Say Y here to be able to classify packets using a universal
+ 32bit pieces based comparison scheme.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_u32.
+
+config CLS_U32_PERF
+ bool "Performance counters support"
+ depends on NET_CLS_U32
+ ---help---
+ Say Y here to make u32 gather additional statistics useful for
+ fine tuning u32 classifiers.
+
+config CLS_U32_MARK
+ bool "Netfilter marks support"
+ depends on NET_CLS_U32
+ ---help---
+ Say Y here to be able to use netfilter marks as u32 key.
+
+config NET_CLS_RSVP
+ tristate "IPv4 Resource Reservation Protocol (RSVP)"
+ select NET_CLS
+ ---help---
+ The Resource Reservation Protocol (RSVP) permits end systems to
+ request a minimum and maximum data flow rate for a connection; this
+ is important for real time data such as streaming sound or video.
+
+ Say Y here if you want to be able to classify outgoing packets based
+ on their RSVP requests.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_rsvp.
+
+config NET_CLS_RSVP6
+ tristate "IPv6 Resource Reservation Protocol (RSVP6)"
+ select NET_CLS
+ ---help---
+ The Resource Reservation Protocol (RSVP) permits end systems to
+ request a minimum and maximum data flow rate for a connection; this
+ is important for real time data such as streaming sound or video.
+
+ Say Y here if you want to be able to classify outgoing packets based
+ on their RSVP requests and you are using the IPv6 protocol.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_rsvp6.
+
+config NET_CLS_FLOW
+ tristate "Flow classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ a configurable combination of packet keys. This is mostly useful
+ in combination with SFQ.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_flow.
+
+config NET_CLS_CGROUP
+ tristate "Control Group Classifier"
+ select NET_CLS
+ select CGROUP_NET_CLASSID
+ depends on CGROUPS
+ ---help---
+ Say Y here if you want to classify packets based on the control
+ cgroup of their process.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_cgroup.
+
+config NET_CLS_BPF
+ tristate "BPF-based classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ programmable BPF (JIT'ed) filters as an alternative to ematches.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_bpf.
+
+config NET_CLS_FLOWER
+ tristate "Flower classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ a configurable combination of packet keys and masks.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_flower.
+
+config NET_CLS_MATCHALL
+ tristate "Match-all classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ nothing. Every packet will match.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_matchall.
+
+config NET_EMATCH
+ bool "Extended Matches"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to use extended matches on top of classifiers
+ and select the extended matches below.
+
+ Extended matches are small classification helpers not worth writing
+ a separate classifier for.
+
+ A recent version of the iproute2 package is required to use
+ extended matches.
+
+config NET_EMATCH_STACK
+ int "Stack size"
+ depends on NET_EMATCH
+ default "32"
+ ---help---
+ Size of the local stack variable used while evaluating the tree of
+ ematches. Limits the depth of the tree, i.e. the number of
+ encapsulated precedences. Every level requires 4 bytes of additional
+ stack space.
+
+config NET_EMATCH_CMP
+ tristate "Simple packet data comparison"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ simple packet data comparisons for 8, 16, and 32bit values.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_cmp.
+
+config NET_EMATCH_NBYTE
+ tristate "Multi byte comparison"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ multiple byte comparisons mainly useful for IPv6 address comparisons.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_nbyte.
+
+config NET_EMATCH_U32
+ tristate "U32 key"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets using
+ the famous u32 key in combination with logic relations.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_u32.
+
+config NET_EMATCH_META
+ tristate "Metadata"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ metadata such as load average, netfilter attributes, socket
+ attributes and routing decisions.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_meta.
+
+config NET_EMATCH_TEXT
+ tristate "Textsearch"
+ depends on NET_EMATCH
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
+ select TEXTSEARCH_FSM
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ textsearch comparisons.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_text.
+
+config NET_EMATCH_CANID
+ tristate "CAN Identifier"
+ depends on NET_EMATCH && (CAN=y || CAN=m)
+ ---help---
+ Say Y here if you want to be able to classify CAN frames based
+ on CAN Identifier.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_canid.
+
+config NET_EMATCH_IPSET
+ tristate "IPset"
+ depends on NET_EMATCH && IP_SET
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ ipset membership.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_ipset.
+
+config NET_EMATCH_IPT
+ tristate "IPtables Matches"
+ depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES
+ ---help---
+ Say Y here to be able to classify packets based on iptables
+ matches.
+ Current supported match is "policy" which allows packet classification
+ based on IPsec policy that was used during decapsulation
+
+ To compile this code as a module, choose M here: the
+ module will be called em_ipt.
+
+config NET_CLS_ACT
+ bool "Actions"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to use traffic control actions. Actions
+ get attached to classifiers and are invoked after a successful
+ classification. They are used to overwrite the classification
+ result, instantly drop or redirect packets, etc.
+
+ A recent version of the iproute2 package is required to use
+ extended matches.
+
+config NET_ACT_POLICE
+ tristate "Traffic Policing"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here if you want to do traffic policing, i.e. strict
+ bandwidth limiting. This action replaces the existing policing
+ module.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_police.
+
+config NET_ACT_GACT
+ tristate "Generic actions"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to take generic actions such as dropping and
+ accepting packets.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_gact.
+
+config GACT_PROB
+ bool "Probability support"
+ depends on NET_ACT_GACT
+ ---help---
+ Say Y here to use the generic action randomly or deterministically.
+
+config NET_ACT_MIRRED
+ tristate "Redirecting and Mirroring"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow packets to be mirrored or redirected to
+ other devices.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_mirred.
+
+config NET_ACT_SAMPLE
+ tristate "Traffic Sampling"
+ depends on NET_CLS_ACT
+ select PSAMPLE
+ ---help---
+ Say Y here to allow packet sampling tc action. The packet sample
+ action consists of statistically choosing packets and sampling
+ them using the psample module.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_sample.
+
+config NET_ACT_IPT
+ tristate "IPtables targets"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ ---help---
+ Say Y here to be able to invoke iptables targets after successful
+ classification.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ipt.
+
+config NET_ACT_NAT
+ tristate "Stateless NAT"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to do stateless NAT on IPv4 packets. You should use
+ netfilter for NAT unless you know what you are doing.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_nat.
+
+config NET_ACT_PEDIT
+ tristate "Packet Editing"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here if you want to mangle the content of packets.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_pedit.
+
+config NET_ACT_SIMP
+ tristate "Simple Example (Debug)"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to add a simple action for demonstration purposes.
+ It is meant as an example and for debugging purposes. It will
+ print a configured policy string followed by the packet count
+ to the console for every packet that passes by.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_simple.
+
+config NET_ACT_SKBEDIT
+ tristate "SKB Editing"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to change skb priority or queue_mapping settings.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_skbedit.
+
+config NET_ACT_CSUM
+ tristate "Checksum Updating"
+ depends on NET_CLS_ACT && INET
+ select LIBCRC32C
+ ---help---
+ Say Y here to update some common checksum after some direct
+ packet alterations.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_csum.
+
+config NET_ACT_VLAN
+ tristate "Vlan manipulation"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to push or pop vlan headers.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_vlan.
+
+config NET_ACT_BPF
+ tristate "BPF based action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to execute BPF code on packets. The BPF code will decide
+ if the packet should be dropped or not.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_bpf.
+
+config NET_ACT_CONNMARK
+ tristate "Netfilter Connection Mark Retriever"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ ---help---
+ Say Y here to allow retrieving of conn mark
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_connmark.
+
+config NET_ACT_SKBMOD
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow modification of skb data
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_skbmod.
+
+config NET_ACT_IFE
+ tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+ depends on NET_CLS_ACT
+ select NET_IFE
+ ---help---
+ Say Y here to allow for sourcing and terminating metadata
+ For details refer to netdev01 paper:
+ "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ife.
+
+config NET_ACT_TUNNEL_KEY
+ tristate "IP tunnel metadata manipulation"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to set/release ip tunnel metadata.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_tunnel_key.
+
+config NET_IFE_SKBMARK
+ tristate "Support to encoding decoding skb mark on IFE action"
+ depends on NET_ACT_IFE
+
+config NET_IFE_SKBPRIO
+ tristate "Support to encoding decoding skb prio on IFE action"
+ depends on NET_ACT_IFE
+
+config NET_IFE_SKBTCINDEX
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
+
+config NET_CLS_IND
+ bool "Incoming device classification"
+ depends on NET_CLS_U32 || NET_CLS_FW
+ ---help---
+ Say Y here to extend the u32 and fw classifier to support
+ classification based on the incoming device. This option is
+ likely to disappear in favour of the metadata ematch.
+
+endif # NET_SCHED
+
+config NET_SCH_FIFO
+ bool
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 000000000..f0403f49e
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux Traffic Control Unit.
+#
+
+obj-y := sch_generic.o sch_mq.o
+
+obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
+obj-$(CONFIG_NET_CLS) += cls_api.o
+obj-$(CONFIG_NET_CLS_ACT) += act_api.o
+obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
+obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
+obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
+obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o
+obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
+obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
+obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
+obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
+obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
+obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
+obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
+obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
+obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
+obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
+obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
+obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
+obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
+obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
+obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
+obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
+obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
+obj-$(CONFIG_NET_SCH_RED) += sch_red.o
+obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
+obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
+obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
+obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
+obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
+obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
+obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
+obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
+obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
+obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
+obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
+obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
+obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
+obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
+obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
+obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
+obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
+obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
+obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
+obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
+obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
+
+obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
+obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
+obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
+obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
+obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
+obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
+obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
+obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
+obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
+obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
+obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o
+obj-$(CONFIG_NET_CLS_MATCHALL) += cls_matchall.o
+obj-$(CONFIG_NET_EMATCH) += ematch.o
+obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
+obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
+obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
+obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
+obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
+obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o
+obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o
+obj-$(CONFIG_NET_EMATCH_IPT) += em_ipt.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
new file mode 100644
index 000000000..ad0773b20
--- /dev/null
+++ b/net/sched/act_api.c
@@ -0,0 +1,1739 @@
+/*
+ * net/sched/act_api.c Packet action API.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Author: Jamal Hadi Salim
+ *
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/rhashtable.h>
+#include <linux/list.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+
+static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
+{
+ u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK;
+
+ if (!tp)
+ return -EINVAL;
+ a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
+ if (!a->goto_chain)
+ return -ENOMEM;
+ return 0;
+}
+
+static void tcf_action_goto_chain_fini(struct tc_action *a)
+{
+ tcf_chain_put_by_act(a->goto_chain);
+}
+
+static void tcf_action_goto_chain_exec(const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct tcf_chain *chain = a->goto_chain;
+
+ res->goto_tp = rcu_dereference_bh(chain->filter_chain);
+}
+
+static void tcf_free_cookie_rcu(struct rcu_head *p)
+{
+ struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu);
+
+ kfree(cookie->data);
+ kfree(cookie);
+}
+
+static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
+ struct tc_cookie *new_cookie)
+{
+ struct tc_cookie *old;
+
+ old = xchg((__force struct tc_cookie **)old_cookie, new_cookie);
+ if (old)
+ call_rcu(&old->rcu, tcf_free_cookie_rcu);
+}
+
+/* XXX: For standalone actions, we don't need a RCU grace period either, because
+ * actions are always connected to filters and filters are already destroyed in
+ * RCU callbacks, so after a RCU grace period actions are already disconnected
+ * from filters. Readers later can not find us.
+ */
+static void free_tcf(struct tc_action *p)
+{
+ free_percpu(p->cpu_bstats);
+ free_percpu(p->cpu_qstats);
+
+ tcf_set_action_cookie(&p->act_cookie, NULL);
+ if (p->goto_chain)
+ tcf_action_goto_chain_fini(p);
+
+ kfree(p);
+}
+
+static void tcf_action_cleanup(struct tc_action *p)
+{
+ if (p->ops->cleanup)
+ p->ops->cleanup(p);
+
+ gen_kill_estimator(&p->tcfa_rate_est);
+ free_tcf(p);
+}
+
+static int __tcf_action_put(struct tc_action *p, bool bind)
+{
+ struct tcf_idrinfo *idrinfo = p->idrinfo;
+
+ if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
+ if (bind)
+ atomic_dec(&p->tcfa_bindcnt);
+ idr_remove(&idrinfo->action_idr, p->tcfa_index);
+ spin_unlock(&idrinfo->lock);
+
+ tcf_action_cleanup(p);
+ return 1;
+ }
+
+ if (bind)
+ atomic_dec(&p->tcfa_bindcnt);
+
+ return 0;
+}
+
+int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
+{
+ int ret = 0;
+
+ /* Release with strict==1 and bind==0 is only called through act API
+ * interface (classifiers always bind). Only case when action with
+ * positive reference count and zero bind count can exist is when it was
+ * also created with act API (unbinding last classifier will destroy the
+ * action if it was created by classifier). So only case when bind count
+ * can be changed after initial check is when unbound action is
+ * destroyed by act API while classifier binds to action with same id
+ * concurrently. This result either creation of new action(same behavior
+ * as before), or reusing existing action if concurrent process
+ * increments reference count before action is deleted. Both scenarios
+ * are acceptable.
+ */
+ if (p) {
+ if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0)
+ return -EPERM;
+
+ if (__tcf_action_put(p, bind))
+ ret = ACT_P_DELETED;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(__tcf_idr_release);
+
+static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
+{
+ struct tc_cookie *act_cookie;
+ u32 cookie_len = 0;
+
+ rcu_read_lock();
+ act_cookie = rcu_dereference(act->act_cookie);
+
+ if (act_cookie)
+ cookie_len = nla_total_size(act_cookie->len);
+ rcu_read_unlock();
+
+ return nla_total_size(0) /* action number nested */
+ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
+ + cookie_len /* TCA_ACT_COOKIE */
+ + nla_total_size(0) /* TCA_ACT_STATS nested */
+ /* TCA_STATS_BASIC */
+ + nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+ /* TCA_STATS_QUEUE */
+ + nla_total_size_64bit(sizeof(struct gnet_stats_queue))
+ + nla_total_size(0) /* TCA_OPTIONS nested */
+ + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */
+}
+
+static size_t tcf_action_full_attrs_size(size_t sz)
+{
+ return NLMSG_HDRLEN /* struct nlmsghdr */
+ + sizeof(struct tcamsg)
+ + nla_total_size(0) /* TCA_ACT_TAB nested */
+ + sz;
+}
+
+static size_t tcf_action_fill_size(const struct tc_action *act)
+{
+ size_t sz = tcf_action_shared_attrs_size(act);
+
+ if (act->ops->get_fill_size)
+ return act->ops->get_fill_size(act) + sz;
+ return sz;
+}
+
+static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int err = 0, index = -1, s_i = 0, n_i = 0;
+ u32 act_flags = cb->args[2];
+ unsigned long jiffy_since = cb->args[3];
+ struct nlattr *nest;
+ struct idr *idr = &idrinfo->action_idr;
+ struct tc_action *p;
+ unsigned long id = 1;
+
+ spin_lock(&idrinfo->lock);
+
+ s_i = cb->args[0];
+
+ idr_for_each_entry_ul(idr, p, id) {
+ index++;
+ if (index < s_i)
+ continue;
+
+ if (jiffy_since &&
+ time_after(jiffy_since,
+ (unsigned long)p->tcfa_tm.lastuse))
+ continue;
+
+ nest = nla_nest_start(skb, n_i);
+ if (!nest) {
+ index--;
+ goto nla_put_failure;
+ }
+ err = tcf_action_dump_1(skb, p, 0, 0);
+ if (err < 0) {
+ index--;
+ nlmsg_trim(skb, nest);
+ goto done;
+ }
+ nla_nest_end(skb, nest);
+ n_i++;
+ if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
+ n_i >= TCA_ACT_MAX_PRIO)
+ goto done;
+ }
+done:
+ if (index >= 0)
+ cb->args[0] = index + 1;
+
+ spin_unlock(&idrinfo->lock);
+ if (n_i) {
+ if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
+ cb->args[1] = n_i;
+ }
+ return n_i;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ goto done;
+}
+
+static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
+ const struct tc_action_ops *ops)
+{
+ struct nlattr *nest;
+ int n_i = 0;
+ int ret = -EINVAL;
+ struct idr *idr = &idrinfo->action_idr;
+ struct tc_action *p;
+ unsigned long id = 1;
+
+ nest = nla_nest_start(skb, 0);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_string(skb, TCA_KIND, ops->kind))
+ goto nla_put_failure;
+
+ idr_for_each_entry_ul(idr, p, id) {
+ ret = __tcf_idr_release(p, false, true);
+ if (ret == ACT_P_DELETED) {
+ module_put(ops->owner);
+ n_i++;
+ } else if (ret < 0) {
+ goto nla_put_failure;
+ }
+ }
+ if (nla_put_u32(skb, TCA_FCNT, n_i))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ return n_i;
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return ret;
+}
+
+int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+
+ if (type == RTM_DELACTION) {
+ return tcf_del_walker(idrinfo, skb, ops);
+ } else if (type == RTM_GETACTION) {
+ return tcf_dump_walker(idrinfo, skb, cb);
+ } else {
+ WARN(1, "tcf_generic_walker: unknown command %d\n", type);
+ NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command");
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(tcf_generic_walker);
+
+int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+ struct tc_action *p;
+
+ spin_lock(&idrinfo->lock);
+ p = idr_find(&idrinfo->action_idr, index);
+ if (IS_ERR(p))
+ p = NULL;
+ else if (p)
+ refcount_inc(&p->tcfa_refcnt);
+ spin_unlock(&idrinfo->lock);
+
+ if (p) {
+ *a = p;
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcf_idr_search);
+
+static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
+{
+ struct tc_action *p;
+ int ret = 0;
+
+ spin_lock(&idrinfo->lock);
+ p = idr_find(&idrinfo->action_idr, index);
+ if (!p) {
+ spin_unlock(&idrinfo->lock);
+ return -ENOENT;
+ }
+
+ if (!atomic_read(&p->tcfa_bindcnt)) {
+ if (refcount_dec_and_test(&p->tcfa_refcnt)) {
+ struct module *owner = p->ops->owner;
+
+ WARN_ON(p != idr_remove(&idrinfo->action_idr,
+ p->tcfa_index));
+ spin_unlock(&idrinfo->lock);
+
+ tcf_action_cleanup(p);
+ module_put(owner);
+ return 0;
+ }
+ ret = 0;
+ } else {
+ ret = -EPERM;
+ }
+
+ spin_unlock(&idrinfo->lock);
+ return ret;
+}
+
+int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
+ struct tc_action **a, const struct tc_action_ops *ops,
+ int bind, bool cpustats)
+{
+ struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+ int err = -ENOMEM;
+
+ if (unlikely(!p))
+ return -ENOMEM;
+ refcount_set(&p->tcfa_refcnt, 1);
+ if (bind)
+ atomic_set(&p->tcfa_bindcnt, 1);
+
+ if (cpustats) {
+ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!p->cpu_bstats)
+ goto err1;
+ p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!p->cpu_qstats)
+ goto err2;
+ }
+ spin_lock_init(&p->tcfa_lock);
+ p->tcfa_index = index;
+ p->tcfa_tm.install = jiffies;
+ p->tcfa_tm.lastuse = jiffies;
+ p->tcfa_tm.firstuse = 0;
+ if (est) {
+ err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
+ &p->tcfa_rate_est,
+ &p->tcfa_lock, NULL, est);
+ if (err)
+ goto err3;
+ }
+
+ p->idrinfo = idrinfo;
+ p->ops = ops;
+ *a = p;
+ return 0;
+err3:
+ free_percpu(p->cpu_qstats);
+err2:
+ free_percpu(p->cpu_bstats);
+err1:
+ kfree(p);
+ return err;
+}
+EXPORT_SYMBOL(tcf_idr_create);
+
+void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+
+ spin_lock(&idrinfo->lock);
+ /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+ WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
+ spin_unlock(&idrinfo->lock);
+}
+EXPORT_SYMBOL(tcf_idr_insert);
+
+/* Cleanup idr index that was allocated but not initialized. */
+
+void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+
+ spin_lock(&idrinfo->lock);
+ /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index)));
+ spin_unlock(&idrinfo->lock);
+}
+EXPORT_SYMBOL(tcf_idr_cleanup);
+
+/* Check if action with specified index exists. If actions is found, increments
+ * its reference and bind counters, and return 1. Otherwise insert temporary
+ * error pointer (to prevent concurrent users from inserting actions with same
+ * index) and return 0.
+ */
+
+int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
+ struct tc_action **a, int bind)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+ struct tc_action *p;
+ int ret;
+
+again:
+ spin_lock(&idrinfo->lock);
+ if (*index) {
+ p = idr_find(&idrinfo->action_idr, *index);
+ if (IS_ERR(p)) {
+ /* This means that another process allocated
+ * index but did not assign the pointer yet.
+ */
+ spin_unlock(&idrinfo->lock);
+ goto again;
+ }
+
+ if (p) {
+ refcount_inc(&p->tcfa_refcnt);
+ if (bind)
+ atomic_inc(&p->tcfa_bindcnt);
+ *a = p;
+ ret = 1;
+ } else {
+ *a = NULL;
+ ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
+ *index, GFP_ATOMIC);
+ if (!ret)
+ idr_replace(&idrinfo->action_idr,
+ ERR_PTR(-EBUSY), *index);
+ }
+ } else {
+ *index = 1;
+ *a = NULL;
+ ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
+ UINT_MAX, GFP_ATOMIC);
+ if (!ret)
+ idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
+ *index);
+ }
+ spin_unlock(&idrinfo->lock);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_check_alloc);
+
+void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
+ struct tcf_idrinfo *idrinfo)
+{
+ struct idr *idr = &idrinfo->action_idr;
+ struct tc_action *p;
+ int ret;
+ unsigned long id = 1;
+
+ idr_for_each_entry_ul(idr, p, id) {
+ ret = __tcf_idr_release(p, false, true);
+ if (ret == ACT_P_DELETED)
+ module_put(ops->owner);
+ else if (ret < 0)
+ return;
+ }
+ idr_destroy(&idrinfo->action_idr);
+}
+EXPORT_SYMBOL(tcf_idrinfo_destroy);
+
+static LIST_HEAD(act_base);
+static DEFINE_RWLOCK(act_mod_lock);
+
+int tcf_register_action(struct tc_action_ops *act,
+ struct pernet_operations *ops)
+{
+ struct tc_action_ops *a;
+ int ret;
+
+ if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
+ return -EINVAL;
+
+ /* We have to register pernet ops before making the action ops visible,
+ * otherwise tcf_action_init_1() could get a partially initialized
+ * netns.
+ */
+ ret = register_pernet_subsys(ops);
+ if (ret)
+ return ret;
+
+ write_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
+ write_unlock(&act_mod_lock);
+ unregister_pernet_subsys(ops);
+ return -EEXIST;
+ }
+ }
+ list_add_tail(&act->head, &act_base);
+ write_unlock(&act_mod_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(tcf_register_action);
+
+int tcf_unregister_action(struct tc_action_ops *act,
+ struct pernet_operations *ops)
+{
+ struct tc_action_ops *a;
+ int err = -ENOENT;
+
+ write_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (a == act) {
+ list_del(&act->head);
+ err = 0;
+ break;
+ }
+ }
+ write_unlock(&act_mod_lock);
+ if (!err)
+ unregister_pernet_subsys(ops);
+ return err;
+}
+EXPORT_SYMBOL(tcf_unregister_action);
+
+/* lookup by name */
+static struct tc_action_ops *tc_lookup_action_n(char *kind)
+{
+ struct tc_action_ops *a, *res = NULL;
+
+ if (kind) {
+ read_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (strcmp(kind, a->kind) == 0) {
+ if (try_module_get(a->owner))
+ res = a;
+ break;
+ }
+ }
+ read_unlock(&act_mod_lock);
+ }
+ return res;
+}
+
+/* lookup by nlattr */
+static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
+{
+ struct tc_action_ops *a, *res = NULL;
+
+ if (kind) {
+ read_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (nla_strcmp(kind, a->kind) == 0) {
+ if (try_module_get(a->owner))
+ res = a;
+ break;
+ }
+ }
+ read_unlock(&act_mod_lock);
+ }
+ return res;
+}
+
+/*TCA_ACT_MAX_PRIO is 32, there count upto 32 */
+#define TCA_ACT_MAX_PRIO_MASK 0x1FF
+int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
+ int nr_actions, struct tcf_result *res)
+{
+ u32 jmp_prgcnt = 0;
+ u32 jmp_ttl = TCA_ACT_MAX_PRIO; /*matches actions per filter */
+ int i;
+ int ret = TC_ACT_OK;
+
+ if (skb_skip_tc_classify(skb))
+ return TC_ACT_OK;
+
+restart_act_graph:
+ for (i = 0; i < nr_actions; i++) {
+ const struct tc_action *a = actions[i];
+ int repeat_ttl;
+
+ if (jmp_prgcnt > 0) {
+ jmp_prgcnt -= 1;
+ continue;
+ }
+
+ repeat_ttl = 32;
+repeat:
+ ret = a->ops->act(skb, a, res);
+
+ if (unlikely(ret == TC_ACT_REPEAT)) {
+ if (--repeat_ttl != 0)
+ goto repeat;
+ /* suspicious opcode, stop pipeline */
+ net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n");
+ return TC_ACT_OK;
+ }
+
+ if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) {
+ jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK;
+ if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) {
+ /* faulty opcode, stop pipeline */
+ return TC_ACT_OK;
+ } else {
+ jmp_ttl -= 1;
+ if (jmp_ttl > 0)
+ goto restart_act_graph;
+ else /* faulty graph, stop pipeline */
+ return TC_ACT_OK;
+ }
+ } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) {
+ tcf_action_goto_chain_exec(a, res);
+ }
+
+ if (ret != TC_ACT_PIPE)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(tcf_action_exec);
+
+int tcf_action_destroy(struct tc_action *actions[], int bind)
+{
+ const struct tc_action_ops *ops;
+ struct tc_action *a;
+ int ret = 0, i;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ a = actions[i];
+ actions[i] = NULL;
+ ops = a->ops;
+ ret = __tcf_idr_release(a, bind, true);
+ if (ret == ACT_P_DELETED)
+ module_put(ops->owner);
+ else if (ret < 0)
+ return ret;
+ }
+ return ret;
+}
+
+static int tcf_action_destroy_1(struct tc_action *a, int bind)
+{
+ struct tc_action *actions[] = { a, NULL };
+
+ return tcf_action_destroy(actions, bind);
+}
+
+static int tcf_action_put(struct tc_action *p)
+{
+ return __tcf_action_put(p, false);
+}
+
+/* Put all actions in this array, skip those NULL's. */
+static void tcf_action_put_many(struct tc_action *actions[])
+{
+ int i;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO; i++) {
+ struct tc_action *a = actions[i];
+ const struct tc_action_ops *ops;
+
+ if (!a)
+ continue;
+ ops = a->ops;
+ if (tcf_action_put(a))
+ module_put(ops->owner);
+ }
+}
+
+int
+tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ return a->ops->dump(skb, a, bind, ref);
+}
+
+int
+tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ int err = -EINVAL;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+ struct tc_cookie *cookie;
+
+ if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+ goto nla_put_failure;
+ if (tcf_action_copy_stats(skb, a, 0))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ cookie = rcu_dereference(a->act_cookie);
+ if (cookie) {
+ if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+ }
+ rcu_read_unlock();
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ err = tcf_action_dump_old(skb, a, bind, ref);
+ if (err > 0) {
+ nla_nest_end(skb, nest);
+ return err;
+ }
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+EXPORT_SYMBOL(tcf_action_dump_1);
+
+int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
+ int bind, int ref)
+{
+ struct tc_action *a;
+ int err = -EINVAL, i;
+ struct nlattr *nest;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ a = actions[i];
+ nest = nla_nest_start(skb, i + 1);
+ if (nest == NULL)
+ goto nla_put_failure;
+ err = tcf_action_dump_1(skb, a, bind, ref);
+ if (err < 0)
+ goto errout;
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+nla_put_failure:
+ err = -EINVAL;
+errout:
+ nla_nest_cancel(skb, nest);
+ return err;
+}
+
+static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
+{
+ struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
+ if (!c->data) {
+ kfree(c);
+ return NULL;
+ }
+ c->len = nla_len(tb[TCA_ACT_COOKIE]);
+
+ return c;
+}
+
+static bool tcf_action_valid(int action)
+{
+ int opcode = TC_ACT_EXT_OPCODE(action);
+
+ if (!opcode)
+ return action <= TC_ACT_VALUE_MAX;
+ return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
+}
+
+struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
+ struct nlattr *nla, struct nlattr *est,
+ char *name, int ovr, int bind,
+ bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action *a;
+ struct tc_action_ops *a_o;
+ struct tc_cookie *cookie = NULL;
+ char act_name[IFNAMSIZ];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ struct nlattr *kind;
+ int err;
+
+ if (name == NULL) {
+ err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ if (err < 0)
+ goto err_out;
+ err = -EINVAL;
+ kind = tb[TCA_ACT_KIND];
+ if (!kind) {
+ NL_SET_ERR_MSG(extack, "TC action kind must be specified");
+ goto err_out;
+ }
+ if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) {
+ NL_SET_ERR_MSG(extack, "TC action name too long");
+ goto err_out;
+ }
+ if (tb[TCA_ACT_COOKIE]) {
+ int cklen = nla_len(tb[TCA_ACT_COOKIE]);
+
+ if (cklen > TC_COOKIE_MAX_SIZE) {
+ NL_SET_ERR_MSG(extack, "TC cookie size above the maximum");
+ goto err_out;
+ }
+
+ cookie = nla_memdup_cookie(tb);
+ if (!cookie) {
+ NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
+ err = -ENOMEM;
+ goto err_out;
+ }
+ }
+ } else {
+ if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
+ NL_SET_ERR_MSG(extack, "TC action name too long");
+ err = -EINVAL;
+ goto err_out;
+ }
+ }
+
+ a_o = tc_lookup_action_n(act_name);
+ if (a_o == NULL) {
+#ifdef CONFIG_MODULES
+ if (rtnl_held)
+ rtnl_unlock();
+ request_module("act_%s", act_name);
+ if (rtnl_held)
+ rtnl_lock();
+
+ a_o = tc_lookup_action_n(act_name);
+
+ /* We dropped the RTNL semaphore in order to
+ * perform the module load. So, even if we
+ * succeeded in loading the module we have to
+ * tell the caller to replay the request. We
+ * indicate this using -EAGAIN.
+ */
+ if (a_o != NULL) {
+ err = -EAGAIN;
+ goto err_mod;
+ }
+#endif
+ NL_SET_ERR_MSG(extack, "Failed to load TC action module");
+ err = -ENOENT;
+ goto err_out;
+ }
+
+ /* backward compatibility for policer */
+ if (name == NULL)
+ err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
+ rtnl_held, extack);
+ else
+ err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
+ extack);
+ if (err < 0)
+ goto err_mod;
+
+ if (!name && tb[TCA_ACT_COOKIE])
+ tcf_set_action_cookie(&a->act_cookie, cookie);
+
+ /* module count goes up only when brand new policy is created
+ * if it exists and is only bound to in a_o->init() then
+ * ACT_P_CREATED is not returned (a zero is).
+ */
+ if (err != ACT_P_CREATED)
+ module_put(a_o->owner);
+
+ if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
+ err = tcf_action_goto_chain_init(a, tp);
+ if (err) {
+ tcf_action_destroy_1(a, bind);
+ NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
+ return ERR_PTR(err);
+ }
+ }
+
+ if (!tcf_action_valid(a->tcfa_action)) {
+ tcf_action_destroy_1(a, bind);
+ NL_SET_ERR_MSG(extack, "Invalid control action value");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!bind && ovr && err == ACT_P_CREATED)
+ refcount_set(&a->tcfa_refcnt, 2);
+
+ return a;
+
+err_mod:
+ module_put(a_o->owner);
+err_out:
+ if (cookie) {
+ kfree(cookie->data);
+ kfree(cookie);
+ }
+ return ERR_PTR(err);
+}
+
+/* Returns numbers of initialized actions or negative error. */
+
+int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
+ struct nlattr *est, char *name, int ovr, int bind,
+ struct tc_action *actions[], size_t *attr_size,
+ bool rtnl_held, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct tc_action *act;
+ size_t sz = 0;
+ int err;
+ int i;
+
+ err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+ act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
+ rtnl_held, extack);
+ if (IS_ERR(act)) {
+ err = PTR_ERR(act);
+ goto err;
+ }
+ act->order = i;
+ sz += tcf_action_fill_size(act);
+ /* Start from index 0 */
+ actions[i - 1] = act;
+ }
+
+ *attr_size = tcf_action_full_attrs_size(sz);
+ return i - 1;
+
+err:
+ tcf_action_destroy(actions, bind);
+ return err;
+}
+
+int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
+ int compat_mode)
+{
+ int err = 0;
+ struct gnet_dump d;
+
+ if (p == NULL)
+ goto errout;
+
+ /* compat_mode being true specifies a call that is supposed
+ * to add additional backward compatibility statistic TLVs.
+ */
+ if (compat_mode) {
+ if (p->type == TCA_OLD_COMPAT)
+ err = gnet_stats_start_copy_compat(skb, 0,
+ TCA_STATS,
+ TCA_XSTATS,
+ &p->tcfa_lock, &d,
+ TCA_PAD);
+ else
+ return 0;
+ } else
+ err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
+ &p->tcfa_lock, &d, TCA_ACT_PAD);
+
+ if (err < 0)
+ goto errout;
+
+ if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
+ gnet_stats_copy_queue(&d, p->cpu_qstats,
+ &p->tcfa_qstats,
+ p->tcfa_qstats.qlen) < 0)
+ goto errout;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto errout;
+
+ return 0;
+
+errout:
+ return -1;
+}
+
+static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
+ u32 portid, u32 seq, u16 flags, int event, int bind,
+ int ref)
+{
+ struct tcamsg *t;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ t = nlmsg_data(nlh);
+ t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
+
+ nest = nla_nest_start(skb, TCA_ACT_TAB);
+ if (!nest)
+ goto out_nlmsg_trim;
+
+ if (tcf_action_dump(skb, actions, bind, ref) < 0)
+ goto out_nlmsg_trim;
+
+ nla_nest_end(skb, nest);
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int
+tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
+ struct tc_action *actions[], int event,
+ struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
+ 0, 1) <= 0) {
+ NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnl_unicast(skb, net, portid);
+}
+
+static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ const struct tc_action_ops *ops;
+ struct tc_action *a;
+ int index;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ if (err < 0)
+ goto err_out;
+
+ err = -EINVAL;
+ if (tb[TCA_ACT_INDEX] == NULL ||
+ nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) {
+ NL_SET_ERR_MSG(extack, "Invalid TC action index value");
+ goto err_out;
+ }
+ index = nla_get_u32(tb[TCA_ACT_INDEX]);
+
+ err = -EINVAL;
+ ops = tc_lookup_action(tb[TCA_ACT_KIND]);
+ if (!ops) { /* could happen in batch of actions */
+ NL_SET_ERR_MSG(extack, "Specified TC action not found");
+ goto err_out;
+ }
+ err = -ENOENT;
+ if (ops->lookup(net, &a, index, extack) == 0)
+ goto err_mod;
+
+ module_put(ops->owner);
+ return a;
+
+err_mod:
+ module_put(ops->owner);
+err_out:
+ return ERR_PTR(err);
+}
+
+static int tca_action_flush(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid,
+ struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ unsigned char *b;
+ struct nlmsghdr *nlh;
+ struct tcamsg *t;
+ struct netlink_callback dcb;
+ struct nlattr *nest;
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ const struct tc_action_ops *ops;
+ struct nlattr *kind;
+ int err = -ENOMEM;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return err;
+
+ b = skb_tail_pointer(skb);
+
+ err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ if (err < 0)
+ goto err_out;
+
+ err = -EINVAL;
+ kind = tb[TCA_ACT_KIND];
+ ops = tc_lookup_action(kind);
+ if (!ops) { /*some idjot trying to flush unknown action */
+ NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action");
+ goto err_out;
+ }
+
+ nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION,
+ sizeof(*t), 0);
+ if (!nlh) {
+ NL_SET_ERR_MSG(extack, "Failed to create TC action flush notification");
+ goto out_module_put;
+ }
+ t = nlmsg_data(nlh);
+ t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
+
+ nest = nla_nest_start(skb, TCA_ACT_TAB);
+ if (!nest) {
+ NL_SET_ERR_MSG(extack, "Failed to add new netlink message");
+ goto out_module_put;
+ }
+
+ err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops, extack);
+ if (err <= 0) {
+ nla_nest_cancel(skb, nest);
+ goto out_module_put;
+ }
+
+ nla_nest_end(skb, nest);
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ nlh->nlmsg_flags |= NLM_F_ROOT;
+ module_put(ops->owner);
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (err > 0)
+ return 0;
+ if (err < 0)
+ NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification");
+
+ return err;
+
+out_module_put:
+ module_put(ops->owner);
+err_out:
+ kfree_skb(skb);
+ return err;
+}
+
+static int tcf_action_delete(struct net *net, struct tc_action *actions[])
+{
+ int i;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ struct tc_action *a = actions[i];
+ const struct tc_action_ops *ops = a->ops;
+ /* Actions can be deleted concurrently so we must save their
+ * type and id to search again after reference is released.
+ */
+ struct tcf_idrinfo *idrinfo = a->idrinfo;
+ u32 act_index = a->tcfa_index;
+
+ actions[i] = NULL;
+ if (tcf_action_put(a)) {
+ /* last reference, action was deleted concurrently */
+ module_put(ops->owner);
+ } else {
+ int ret;
+
+ /* now do the delete */
+ ret = tcf_idr_delete_index(idrinfo, act_index);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static int
+tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
+ u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
+{
+ int ret;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
+ GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
+ 0, 2) <= 0) {
+ NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* now do the delete */
+ ret = tcf_action_delete(net, actions);
+ if (ret < 0) {
+ NL_SET_ERR_MSG(extack, "Failed to delete TC action");
+ kfree_skb(skb);
+ return ret;
+ }
+
+ ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (ret > 0)
+ return 0;
+ return ret;
+}
+
+static int
+tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
+ u32 portid, int event, struct netlink_ext_ack *extack)
+{
+ int i, ret;
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct tc_action *act;
+ size_t attr_size = 0;
+ struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
+
+ ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
+ if (ret < 0)
+ return ret;
+
+ if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
+ if (tb[1])
+ return tca_action_flush(net, tb[1], n, portid, extack);
+
+ NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action");
+ return -EINVAL;
+ }
+
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+ act = tcf_action_get_1(net, tb[i], n, portid, extack);
+ if (IS_ERR(act)) {
+ ret = PTR_ERR(act);
+ goto err;
+ }
+ attr_size += tcf_action_fill_size(act);
+ actions[i - 1] = act;
+ }
+
+ attr_size = tcf_action_full_attrs_size(attr_size);
+
+ if (event == RTM_GETACTION)
+ ret = tcf_get_notify(net, portid, n, actions, event, extack);
+ else { /* delete */
+ ret = tcf_del_notify(net, n, actions, portid, attr_size, extack);
+ if (ret)
+ goto err;
+ return 0;
+ }
+err:
+ tcf_action_put_many(actions);
+ return ret;
+}
+
+static int
+tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
+ u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
+ GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
+ RTM_NEWACTION, 0, 0) <= 0) {
+ NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (err > 0)
+ err = 0;
+ return err;
+}
+
+static int tcf_action_add(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid, int ovr,
+ struct netlink_ext_ack *extack)
+{
+ size_t attr_size = 0;
+ int loop, ret;
+ struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
+
+ for (loop = 0; loop < 10; loop++) {
+ ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0,
+ actions, &attr_size, true, extack);
+ if (ret != -EAGAIN)
+ break;
+ }
+
+ if (ret < 0)
+ return ret;
+ ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
+ if (ovr)
+ tcf_action_put_many(actions);
+
+ return ret;
+}
+
+static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
+static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
+ [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32,
+ .validation_data = &tcaa_root_flags_allowed },
+ [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 },
+};
+
+static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_ROOT_MAX + 1];
+ u32 portid = skb ? NETLINK_CB(skb).portid : 0;
+ int ret = 0, ovr = 0;
+
+ if ((n->nlmsg_type != RTM_GETACTION) &&
+ !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
+ extack);
+ if (ret < 0)
+ return ret;
+
+ if (tca[TCA_ACT_TAB] == NULL) {
+ NL_SET_ERR_MSG(extack, "Netlink action attributes missing");
+ return -EINVAL;
+ }
+
+ /* n->nlmsg_flags & NLM_F_CREATE */
+ switch (n->nlmsg_type) {
+ case RTM_NEWACTION:
+ /* we are going to assume all other flags
+ * imply create only if it doesn't exist
+ * Note that CREATE | EXCL implies that
+ * but since we want avoid ambiguity (eg when flags
+ * is zero) then just set this
+ */
+ if (n->nlmsg_flags & NLM_F_REPLACE)
+ ovr = 1;
+ ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
+ extack);
+ break;
+ case RTM_DELACTION:
+ ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
+ portid, RTM_DELACTION, extack);
+ break;
+ case RTM_GETACTION:
+ ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
+ portid, RTM_GETACTION, extack);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+static struct nlattr *find_dump_kind(struct nlattr **nla)
+{
+ struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct nlattr *kind;
+
+ tb1 = nla[TCA_ACT_TAB];
+ if (tb1 == NULL)
+ return NULL;
+
+ if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1),
+ NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0)
+ return NULL;
+
+ if (tb[1] == NULL)
+ return NULL;
+ if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0)
+ return NULL;
+ kind = tb2[TCA_ACT_KIND];
+
+ return kind;
+}
+
+static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+ struct tc_action_ops *a_o;
+ int ret = 0;
+ struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
+ struct nlattr *tb[TCA_ROOT_MAX + 1];
+ struct nlattr *count_attr = NULL;
+ unsigned long jiffy_since = 0;
+ struct nlattr *kind = NULL;
+ struct nla_bitfield32 bf;
+ u32 msecs_since = 0;
+ u32 act_count = 0;
+
+ ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
+ tcaa_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ kind = find_dump_kind(tb);
+ if (kind == NULL) {
+ pr_info("tc_dump_action: action bad kind\n");
+ return 0;
+ }
+
+ a_o = tc_lookup_action(kind);
+ if (a_o == NULL)
+ return 0;
+
+ cb->args[2] = 0;
+ if (tb[TCA_ROOT_FLAGS]) {
+ bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]);
+ cb->args[2] = bf.value;
+ }
+
+ if (tb[TCA_ROOT_TIME_DELTA]) {
+ msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]);
+ }
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*t), 0);
+ if (!nlh)
+ goto out_module_put;
+
+ if (msecs_since)
+ jiffy_since = jiffies - msecs_to_jiffies(msecs_since);
+
+ t = nlmsg_data(nlh);
+ t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
+ cb->args[3] = jiffy_since;
+ count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32));
+ if (!count_attr)
+ goto out_module_put;
+
+ nest = nla_nest_start(skb, TCA_ACT_TAB);
+ if (nest == NULL)
+ goto out_module_put;
+
+ ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o, NULL);
+ if (ret < 0)
+ goto out_module_put;
+
+ if (ret > 0) {
+ nla_nest_end(skb, nest);
+ ret = skb->len;
+ act_count = cb->args[1];
+ memcpy(nla_data(count_attr), &act_count, sizeof(u32));
+ cb->args[1] = 0;
+ } else
+ nlmsg_trim(skb, b);
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ if (NETLINK_CB(cb->skb).portid && ret)
+ nlh->nlmsg_flags |= NLM_F_MULTI;
+ module_put(a_o->owner);
+ return skb->len;
+
+out_module_put:
+ module_put(a_o->owner);
+ nlmsg_trim(skb, b);
+ return skb->len;
+}
+
+struct tcf_action_net {
+ struct rhashtable egdev_ht;
+};
+
+static unsigned int tcf_action_net_id;
+
+struct tcf_action_egdev_cb {
+ struct list_head list;
+ tc_setup_cb_t *cb;
+ void *cb_priv;
+};
+
+struct tcf_action_egdev {
+ struct rhash_head ht_node;
+ const struct net_device *dev;
+ unsigned int refcnt;
+ struct list_head cb_list;
+};
+
+static const struct rhashtable_params tcf_action_egdev_ht_params = {
+ .key_offset = offsetof(struct tcf_action_egdev, dev),
+ .head_offset = offsetof(struct tcf_action_egdev, ht_node),
+ .key_len = sizeof(const struct net_device *),
+};
+
+static struct tcf_action_egdev *
+tcf_action_egdev_lookup(const struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
+
+ return rhashtable_lookup_fast(&tan->egdev_ht, &dev,
+ tcf_action_egdev_ht_params);
+}
+
+static struct tcf_action_egdev *
+tcf_action_egdev_get(const struct net_device *dev)
+{
+ struct tcf_action_egdev *egdev;
+ struct tcf_action_net *tan;
+
+ egdev = tcf_action_egdev_lookup(dev);
+ if (egdev)
+ goto inc_ref;
+
+ egdev = kzalloc(sizeof(*egdev), GFP_KERNEL);
+ if (!egdev)
+ return NULL;
+ INIT_LIST_HEAD(&egdev->cb_list);
+ egdev->dev = dev;
+ tan = net_generic(dev_net(dev), tcf_action_net_id);
+ rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node,
+ tcf_action_egdev_ht_params);
+
+inc_ref:
+ egdev->refcnt++;
+ return egdev;
+}
+
+static void tcf_action_egdev_put(struct tcf_action_egdev *egdev)
+{
+ struct tcf_action_net *tan;
+
+ if (--egdev->refcnt)
+ return;
+ tan = net_generic(dev_net(egdev->dev), tcf_action_net_id);
+ rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node,
+ tcf_action_egdev_ht_params);
+ kfree(egdev);
+}
+
+static struct tcf_action_egdev_cb *
+tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev,
+ tc_setup_cb_t *cb, void *cb_priv)
+{
+ struct tcf_action_egdev_cb *egdev_cb;
+
+ list_for_each_entry(egdev_cb, &egdev->cb_list, list)
+ if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv)
+ return egdev_cb;
+ return NULL;
+}
+
+static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev,
+ enum tc_setup_type type,
+ void *type_data, bool err_stop)
+{
+ struct tcf_action_egdev_cb *egdev_cb;
+ int ok_count = 0;
+ int err;
+
+ list_for_each_entry(egdev_cb, &egdev->cb_list, list) {
+ err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv);
+ if (err) {
+ if (err_stop)
+ return err;
+ } else {
+ ok_count++;
+ }
+ }
+ return ok_count;
+}
+
+static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev,
+ tc_setup_cb_t *cb, void *cb_priv)
+{
+ struct tcf_action_egdev_cb *egdev_cb;
+
+ egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
+ if (WARN_ON(egdev_cb))
+ return -EEXIST;
+ egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL);
+ if (!egdev_cb)
+ return -ENOMEM;
+ egdev_cb->cb = cb;
+ egdev_cb->cb_priv = cb_priv;
+ list_add(&egdev_cb->list, &egdev->cb_list);
+ return 0;
+}
+
+static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev,
+ tc_setup_cb_t *cb, void *cb_priv)
+{
+ struct tcf_action_egdev_cb *egdev_cb;
+
+ egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
+ if (WARN_ON(!egdev_cb))
+ return;
+ list_del(&egdev_cb->list);
+ kfree(egdev_cb);
+}
+
+static int __tc_setup_cb_egdev_register(const struct net_device *dev,
+ tc_setup_cb_t *cb, void *cb_priv)
+{
+ struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev);
+ int err;
+
+ if (!egdev)
+ return -ENOMEM;
+ err = tcf_action_egdev_cb_add(egdev, cb, cb_priv);
+ if (err)
+ goto err_cb_add;
+ return 0;
+
+err_cb_add:
+ tcf_action_egdev_put(egdev);
+ return err;
+}
+int tc_setup_cb_egdev_register(const struct net_device *dev,
+ tc_setup_cb_t *cb, void *cb_priv)
+{
+ int err;
+
+ rtnl_lock();
+ err = __tc_setup_cb_egdev_register(dev, cb, cb_priv);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register);
+
+static void __tc_setup_cb_egdev_unregister(const struct net_device *dev,
+ tc_setup_cb_t *cb, void *cb_priv)
+{
+ struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
+
+ if (WARN_ON(!egdev))
+ return;
+ tcf_action_egdev_cb_del(egdev, cb, cb_priv);
+ tcf_action_egdev_put(egdev);
+}
+void tc_setup_cb_egdev_unregister(const struct net_device *dev,
+ tc_setup_cb_t *cb, void *cb_priv)
+{
+ rtnl_lock();
+ __tc_setup_cb_egdev_unregister(dev, cb, cb_priv);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister);
+
+int tc_setup_cb_egdev_call(const struct net_device *dev,
+ enum tc_setup_type type, void *type_data,
+ bool err_stop)
+{
+ struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
+
+ if (!egdev)
+ return 0;
+ return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop);
+}
+EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call);
+
+static __net_init int tcf_action_net_init(struct net *net)
+{
+ struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
+
+ return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params);
+}
+
+static void __net_exit tcf_action_net_exit(struct net *net)
+{
+ struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
+
+ rhashtable_destroy(&tan->egdev_ht);
+}
+
+static struct pernet_operations tcf_action_net_ops = {
+ .init = tcf_action_net_init,
+ .exit = tcf_action_net_exit,
+ .id = &tcf_action_net_id,
+ .size = sizeof(struct tcf_action_net),
+};
+
+static int __init tc_action_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&tcf_action_net_ops);
+ if (err)
+ return err;
+
+ rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
+ 0);
+
+ return 0;
+}
+
+subsys_initcall(tc_action_init);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
new file mode 100644
index 000000000..800846d77
--- /dev/null
+++ b/net/sched/act_bpf.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_bpf.h>
+#include <net/tc_act/tc_bpf.h>
+
+#define ACT_BPF_NAME_LEN 256
+
+struct tcf_bpf_cfg {
+ struct bpf_prog *filter;
+ struct sock_filter *bpf_ops;
+ const char *bpf_name;
+ u16 bpf_num_ops;
+ bool is_ebpf;
+};
+
+static unsigned int bpf_net_id;
+static struct tc_action_ops act_bpf_ops;
+
+static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
+ struct tcf_result *res)
+{
+ bool at_ingress = skb_at_tc_ingress(skb);
+ struct tcf_bpf *prog = to_bpf(act);
+ struct bpf_prog *filter;
+ int action, filter_res;
+
+ tcf_lastuse_update(&prog->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
+
+ rcu_read_lock();
+ filter = rcu_dereference(prog->filter);
+ if (at_ingress) {
+ __skb_push(skb, skb->mac_len);
+ bpf_compute_data_pointers(skb);
+ filter_res = BPF_PROG_RUN(filter, skb);
+ __skb_pull(skb, skb->mac_len);
+ } else {
+ bpf_compute_data_pointers(skb);
+ filter_res = BPF_PROG_RUN(filter, skb);
+ }
+ rcu_read_unlock();
+
+ /* A BPF program may overwrite the default action opcode.
+ * Similarly as in cls_bpf, if filter_res == -1 we use the
+ * default action specified from tc.
+ *
+ * In case a different well-known TC_ACT opcode has been
+ * returned, it will overwrite the default one.
+ *
+ * For everything else that is unkown, TC_ACT_UNSPEC is
+ * returned.
+ */
+ switch (filter_res) {
+ case TC_ACT_PIPE:
+ case TC_ACT_RECLASSIFY:
+ case TC_ACT_OK:
+ case TC_ACT_REDIRECT:
+ action = filter_res;
+ break;
+ case TC_ACT_SHOT:
+ action = filter_res;
+ qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats));
+ break;
+ case TC_ACT_UNSPEC:
+ action = prog->tcf_action;
+ break;
+ default:
+ action = TC_ACT_UNSPEC;
+ break;
+ }
+
+ return action;
+}
+
+static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog)
+{
+ return !prog->bpf_ops;
+}
+
+static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
+ struct sk_buff *skb)
+{
+ struct nlattr *nla;
+
+ if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops))
+ return -EMSGSIZE;
+
+ nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops *
+ sizeof(struct sock_filter));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+ return 0;
+}
+
+static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
+ struct sk_buff *skb)
+{
+ struct nlattr *nla;
+
+ if (prog->bpf_name &&
+ nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_ACT_BPF_ID, prog->filter->aux->id))
+ return -EMSGSIZE;
+
+ nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
+
+ return 0;
+}
+
+static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
+ int bind, int ref)
+{
+ unsigned char *tp = skb_tail_pointer(skb);
+ struct tcf_bpf *prog = to_bpf(act);
+ struct tc_act_bpf opt = {
+ .index = prog->tcf_index,
+ .refcnt = refcount_read(&prog->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind,
+ };
+ struct tcf_t tm;
+ int ret;
+
+ spin_lock_bh(&prog->tcf_lock);
+ opt.action = prog->tcf_action;
+ if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (tcf_bpf_is_ebpf(prog))
+ ret = tcf_bpf_dump_ebpf_info(prog, skb);
+ else
+ ret = tcf_bpf_dump_bpf_info(prog, skb);
+ if (ret)
+ goto nla_put_failure;
+
+ tcf_tm_dump(&tm, &prog->tcf_tm);
+ if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
+ TCA_ACT_BPF_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&prog->tcf_lock);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&prog->tcf_lock);
+ nlmsg_trim(skb, tp);
+ return -1;
+}
+
+static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = {
+ [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) },
+ [TCA_ACT_BPF_FD] = { .type = NLA_U32 },
+ [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING,
+ .len = ACT_BPF_NAME_LEN },
+ [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 },
+ [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY,
+ .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
+{
+ struct sock_filter *bpf_ops;
+ struct sock_fprog_kern fprog_tmp;
+ struct bpf_prog *fp;
+ u16 bpf_size, bpf_num_ops;
+ int ret;
+
+ bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]);
+ if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
+ return -EINVAL;
+
+ bpf_size = bpf_num_ops * sizeof(*bpf_ops);
+ if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
+ return -EINVAL;
+
+ bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL);
+ if (bpf_ops == NULL)
+ return -ENOMEM;
+
+ fprog_tmp.len = bpf_num_ops;
+ fprog_tmp.filter = bpf_ops;
+
+ ret = bpf_prog_create(&fp, &fprog_tmp);
+ if (ret < 0) {
+ kfree(bpf_ops);
+ return ret;
+ }
+
+ cfg->bpf_ops = bpf_ops;
+ cfg->bpf_num_ops = bpf_num_ops;
+ cfg->filter = fp;
+ cfg->is_ebpf = false;
+
+ return 0;
+}
+
+static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
+{
+ struct bpf_prog *fp;
+ char *name = NULL;
+ u32 bpf_fd;
+
+ bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]);
+
+ fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ if (tb[TCA_ACT_BPF_NAME]) {
+ name = nla_memdup(tb[TCA_ACT_BPF_NAME], GFP_KERNEL);
+ if (!name) {
+ bpf_prog_put(fp);
+ return -ENOMEM;
+ }
+ }
+
+ cfg->bpf_name = name;
+ cfg->filter = fp;
+ cfg->is_ebpf = true;
+
+ return 0;
+}
+
+static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg)
+{
+ struct bpf_prog *filter = cfg->filter;
+
+ if (filter) {
+ if (cfg->is_ebpf)
+ bpf_prog_put(filter);
+ else
+ bpf_prog_destroy(filter);
+ }
+
+ kfree(cfg->bpf_ops);
+ kfree(cfg->bpf_name);
+}
+
+static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
+ struct tcf_bpf_cfg *cfg)
+{
+ cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
+ /* updates to prog->filter are prevented, since it's called either
+ * with tcf lock or during final cleanup in rcu callback
+ */
+ cfg->filter = rcu_dereference_protected(prog->filter, 1);
+
+ cfg->bpf_ops = prog->bpf_ops;
+ cfg->bpf_name = prog->bpf_name;
+}
+
+static int tcf_bpf_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **act,
+ int replace, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+ struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+ struct tcf_bpf_cfg cfg, old;
+ struct tc_act_bpf *parm;
+ struct tcf_bpf *prog;
+ bool is_bpf, is_ebpf;
+ int ret, res = 0;
+ u32 index;
+
+ if (!nla)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[TCA_ACT_BPF_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, act, bind);
+ if (!ret) {
+ ret = tcf_idr_create(tn, index, est, act,
+ &act_bpf_ops, bind, true);
+ if (ret < 0) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ res = ACT_P_CREATED;
+ } else if (ret > 0) {
+ /* Don't override defaults. */
+ if (bind)
+ return 0;
+
+ if (!replace) {
+ tcf_idr_release(*act, bind);
+ return -EEXIST;
+ }
+ } else {
+ return ret;
+ }
+
+ is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
+ is_ebpf = tb[TCA_ACT_BPF_FD];
+
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ memset(&cfg, 0, sizeof(cfg));
+
+ ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
+ tcf_bpf_init_from_efd(tb, &cfg);
+ if (ret < 0)
+ goto out;
+
+ prog = to_bpf(*act);
+
+ spin_lock_bh(&prog->tcf_lock);
+ if (res != ACT_P_CREATED)
+ tcf_bpf_prog_fill_cfg(prog, &old);
+
+ prog->bpf_ops = cfg.bpf_ops;
+ prog->bpf_name = cfg.bpf_name;
+
+ if (cfg.bpf_num_ops)
+ prog->bpf_num_ops = cfg.bpf_num_ops;
+
+ prog->tcf_action = parm->action;
+ rcu_assign_pointer(prog->filter, cfg.filter);
+ spin_unlock_bh(&prog->tcf_lock);
+
+ if (res == ACT_P_CREATED) {
+ tcf_idr_insert(tn, *act);
+ } else {
+ /* make sure the program being replaced is no longer executing */
+ synchronize_rcu();
+ tcf_bpf_cfg_cleanup(&old);
+ }
+
+ return res;
+out:
+ tcf_idr_release(*act, bind);
+
+ return ret;
+}
+
+static void tcf_bpf_cleanup(struct tc_action *act)
+{
+ struct tcf_bpf_cfg tmp;
+
+ tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp);
+ tcf_bpf_cfg_cleanup(&tmp);
+}
+
+static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_bpf_ops __read_mostly = {
+ .kind = "bpf",
+ .type = TCA_ACT_BPF,
+ .owner = THIS_MODULE,
+ .act = tcf_bpf_act,
+ .dump = tcf_bpf_dump,
+ .cleanup = tcf_bpf_cleanup,
+ .init = tcf_bpf_init,
+ .walk = tcf_bpf_walker,
+ .lookup = tcf_bpf_search,
+ .size = sizeof(struct tcf_bpf),
+};
+
+static __net_init int bpf_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ return tc_action_net_init(net, tn, &act_bpf_ops);
+}
+
+static void __net_exit bpf_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, bpf_net_id);
+}
+
+static struct pernet_operations bpf_net_ops = {
+ .init = bpf_init_net,
+ .exit_batch = bpf_exit_net,
+ .id = &bpf_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init bpf_init_module(void)
+{
+ return tcf_register_action(&act_bpf_ops, &bpf_net_ops);
+}
+
+static void __exit bpf_cleanup_module(void)
+{
+ tcf_unregister_action(&act_bpf_ops, &bpf_net_ops);
+}
+
+module_init(bpf_init_module);
+module_exit(bpf_cleanup_module);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("TC BPF based action");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
new file mode 100644
index 000000000..6485f421c
--- /dev/null
+++ b/net/sched/act_connmark.c
@@ -0,0 +1,250 @@
+/*
+ * net/sched/act_connmark.c netfilter connmark retriever action
+ * skb mark is over-written
+ *
+ * Copyright (c) 2011 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <uapi/linux/tc_act/tc_connmark.h>
+#include <net/tc_act/tc_connmark.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static unsigned int connmark_net_id;
+static struct tc_action_ops act_connmark_ops;
+
+static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash;
+ struct nf_conntrack_tuple tuple;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_connmark_info *ca = to_connmark(a);
+ struct nf_conntrack_zone zone;
+ struct nf_conn *c;
+ int proto;
+
+ spin_lock(&ca->tcf_lock);
+ tcf_lastuse_update(&ca->tcf_tm);
+ bstats_update(&ca->tcf_bstats, skb);
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ if (skb->len < sizeof(struct iphdr))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ if (skb->len < sizeof(struct ipv6hdr))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ break;
+ default:
+ goto out;
+ }
+
+ c = nf_ct_get(skb, &ctinfo);
+ if (c) {
+ skb->mark = c->mark;
+ /* using overlimits stats to count how many packets marked */
+ ca->tcf_qstats.overlimits++;
+ goto out;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, ca->net, &tuple))
+ goto out;
+
+ zone.id = ca->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(ca->net, &zone, &tuple);
+ if (!thash)
+ goto out;
+
+ c = nf_ct_tuplehash_to_ctrack(thash);
+ /* using overlimits stats to count how many packets marked */
+ ca->tcf_qstats.overlimits++;
+ skb->mark = c->mark;
+ nf_ct_put(c);
+
+out:
+ spin_unlock(&ca->tcf_lock);
+ return ca->tcf_action;
+}
+
+static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
+ [TCA_CONNMARK_PARMS] = { .len = sizeof(struct tc_connmark) },
+};
+
+static int tcf_connmark_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+ struct nlattr *tb[TCA_CONNMARK_MAX + 1];
+ struct tcf_connmark_info *ci;
+ struct tc_connmark *parm;
+ int ret = 0;
+ u32 index;
+
+ if (!nla)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy,
+ NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[TCA_CONNMARK_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_CONNMARK_PARMS]);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (!ret) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_connmark_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ ci = to_connmark(*a);
+ ci->tcf_action = parm->action;
+ ci->net = net;
+ ci->zone = parm->zone;
+
+ tcf_idr_insert(tn, *a);
+ ret = ACT_P_CREATED;
+ } else if (ret > 0) {
+ ci = to_connmark(*a);
+ if (bind)
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ /* replacing action and zone */
+ ci->tcf_action = parm->action;
+ ci->zone = parm->zone;
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_connmark_info *ci = to_connmark(a);
+
+ struct tc_connmark opt = {
+ .index = ci->tcf_index,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+ .action = ci->tcf_action,
+ .zone = ci->zone,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &ci->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
+ TCA_CONNMARK_PAD))
+ goto nla_put_failure;
+
+ return skb->len;
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_connmark_ops = {
+ .kind = "connmark",
+ .type = TCA_ACT_CONNMARK,
+ .owner = THIS_MODULE,
+ .act = tcf_connmark_act,
+ .dump = tcf_connmark_dump,
+ .init = tcf_connmark_init,
+ .walk = tcf_connmark_walker,
+ .lookup = tcf_connmark_search,
+ .size = sizeof(struct tcf_connmark_info),
+};
+
+static __net_init int connmark_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ return tc_action_net_init(net, tn, &act_connmark_ops);
+}
+
+static void __net_exit connmark_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, connmark_net_id);
+}
+
+static struct pernet_operations connmark_net_ops = {
+ .init = connmark_init_net,
+ .exit_batch = connmark_exit_net,
+ .id = &connmark_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init connmark_init_module(void)
+{
+ return tcf_register_action(&act_connmark_ops, &connmark_net_ops);
+}
+
+static void __exit connmark_cleanup_module(void)
+{
+ tcf_unregister_action(&act_connmark_ops, &connmark_net_ops);
+}
+
+module_init(connmark_init_module);
+module_exit(connmark_cleanup_module);
+MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
+MODULE_DESCRIPTION("Connection tracking mark restoring");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
new file mode 100644
index 000000000..24ad4ceac
--- /dev/null
+++ b/net/sched/act_csum.c
@@ -0,0 +1,737 @@
+/*
+ * Checksum updating actions
+ *
+ * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include <linux/netlink.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/igmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+#include <net/sctp/checksum.h>
+
+#include <net/act_api.h>
+
+#include <linux/tc_act/tc_csum.h>
+#include <net/tc_act/tc_csum.h>
+
+static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
+ [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
+};
+
+static unsigned int csum_net_id;
+static struct tc_action_ops act_csum_ops;
+
+static int tcf_csum_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a, int ovr,
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+ struct tcf_csum_params *params_new;
+ struct nlattr *tb[TCA_CSUM_MAX + 1];
+ struct tc_csum *parm;
+ struct tcf_csum *p;
+ int ret = 0, err;
+ u32 index;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CSUM_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_CSUM_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_csum_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind)/* dont override defaults */
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+
+ p = to_tcf_csum(*a);
+
+ params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+ if (unlikely(!params_new)) {
+ tcf_idr_release(*a, bind);
+ return -ENOMEM;
+ }
+ params_new->update_flags = parm->update_flags;
+
+ spin_lock_bh(&p->tcf_lock);
+ p->tcf_action = parm->action;
+ rcu_swap_protected(p->params, params_new,
+ lockdep_is_held(&p->tcf_lock));
+ spin_unlock_bh(&p->tcf_lock);
+
+ if (params_new)
+ kfree_rcu(params_new, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+}
+
+/**
+ * tcf_csum_skb_nextlayer - Get next layer pointer
+ * @skb: sk_buff to use
+ * @ihl: previous summed headers length
+ * @ipl: complete packet length
+ * @jhl: next header length
+ *
+ * Check the expected next layer availability in the specified sk_buff.
+ * Return the next layer pointer if pass, NULL otherwise.
+ */
+static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl,
+ unsigned int jhl)
+{
+ int ntkoff = skb_network_offset(skb);
+ int hl = ihl + jhl;
+
+ if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
+ skb_try_make_writable(skb, hl + ntkoff))
+ return NULL;
+ else
+ return (void *)(skb_network_header(skb) + ihl);
+}
+
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
+{
+ struct icmphdr *icmph;
+
+ icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph));
+ if (icmph == NULL)
+ return 0;
+
+ icmph->checksum = 0;
+ skb->csum = csum_partial(icmph, ipl - ihl, 0);
+ icmph->checksum = csum_fold(skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct igmphdr *igmph;
+
+ igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph));
+ if (igmph == NULL)
+ return 0;
+
+ igmph->csum = 0;
+ skb->csum = csum_partial(igmph, ipl - ihl, 0);
+ igmph->csum = csum_fold(skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
+{
+ struct icmp6hdr *icmp6h;
+ const struct ipv6hdr *ip6h;
+
+ icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
+ if (icmp6h == NULL)
+ return 0;
+
+ ip6h = ipv6_hdr(skb);
+ icmp6h->icmp6_cksum = 0;
+ skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ipl - ihl, IPPROTO_ICMPV6,
+ skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
+{
+ struct tcphdr *tcph;
+ const struct iphdr *iph;
+
+ if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+ return 1;
+
+ tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+ if (tcph == NULL)
+ return 0;
+
+ iph = ip_hdr(skb);
+ tcph->check = 0;
+ skb->csum = csum_partial(tcph, ipl - ihl, 0);
+ tcph->check = tcp_v4_check(ipl - ihl,
+ iph->saddr, iph->daddr, skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
+{
+ struct tcphdr *tcph;
+ const struct ipv6hdr *ip6h;
+
+ if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
+ return 1;
+
+ tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+ if (tcph == NULL)
+ return 0;
+
+ ip6h = ipv6_hdr(skb);
+ tcph->check = 0;
+ skb->csum = csum_partial(tcph, ipl - ihl, 0);
+ tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ipl - ihl, IPPROTO_TCP,
+ skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
+{
+ struct udphdr *udph;
+ const struct iphdr *iph;
+ u16 ul;
+
+ if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+ return 1;
+
+ /*
+ * Support both UDP and UDPLITE checksum algorithms, Don't use
+ * udph->len to get the real length without any protocol check,
+ * UDPLITE uses udph->len for another thing,
+ * Use iph->tot_len, or just ipl.
+ */
+
+ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+ if (udph == NULL)
+ return 0;
+
+ iph = ip_hdr(skb);
+ ul = ntohs(udph->len);
+
+ if (udplite || udph->check) {
+
+ udph->check = 0;
+
+ if (udplite) {
+ if (ul == 0)
+ skb->csum = csum_partial(udph, ipl - ihl, 0);
+ else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+ skb->csum = csum_partial(udph, ul, 0);
+ else
+ goto ignore_obscure_skb;
+ } else {
+ if (ul != ipl - ihl)
+ goto ignore_obscure_skb;
+
+ skb->csum = csum_partial(udph, ul, 0);
+ }
+
+ udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ ul, iph->protocol,
+ skb->csum);
+
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+ return 1;
+}
+
+static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
+{
+ struct udphdr *udph;
+ const struct ipv6hdr *ip6h;
+ u16 ul;
+
+ if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+ return 1;
+
+ /*
+ * Support both UDP and UDPLITE checksum algorithms, Don't use
+ * udph->len to get the real length without any protocol check,
+ * UDPLITE uses udph->len for another thing,
+ * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl.
+ */
+
+ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+ if (udph == NULL)
+ return 0;
+
+ ip6h = ipv6_hdr(skb);
+ ul = ntohs(udph->len);
+
+ udph->check = 0;
+
+ if (udplite) {
+ if (ul == 0)
+ skb->csum = csum_partial(udph, ipl - ihl, 0);
+
+ else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+ skb->csum = csum_partial(udph, ul, 0);
+
+ else
+ goto ignore_obscure_skb;
+ } else {
+ if (ul != ipl - ihl)
+ goto ignore_obscure_skb;
+
+ skb->csum = csum_partial(udph, ul, 0);
+ }
+
+ udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
+ udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
+ skb->csum);
+
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+ return 1;
+}
+
+static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
+{
+ struct sctphdr *sctph;
+
+ if (skb_is_gso(skb) && skb_is_gso_sctp(skb))
+ return 1;
+
+ sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph));
+ if (!sctph)
+ return 0;
+
+ sctph->checksum = sctp_compute_cksum(skb,
+ skb_network_offset(skb) + ihl);
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum_not_inet = 0;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
+{
+ const struct iphdr *iph;
+ int ntkoff;
+
+ ntkoff = skb_network_offset(skb);
+
+ if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
+ goto fail;
+
+ iph = ip_hdr(skb);
+
+ switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
+ case IPPROTO_ICMP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+ if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_IGMP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
+ if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_TCP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+ if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_UDP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+ if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len), 0))
+ goto fail;
+ break;
+ case IPPROTO_UDPLITE:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+ if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len), 1))
+ goto fail;
+ break;
+ case IPPROTO_SCTP:
+ if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+ !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ }
+
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
+ if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff))
+ goto fail;
+
+ ip_send_check(ip_hdr(skb));
+ }
+
+ return 1;
+
+fail:
+ return 0;
+}
+
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl,
+ unsigned int *pl)
+{
+ int off, len, optlen;
+ unsigned char *xh = (void *)ip6xh;
+
+ off = sizeof(*ip6xh);
+ len = ixhl - off;
+
+ while (len > 1) {
+ switch (xh[off]) {
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+ case IPV6_TLV_JUMBO:
+ optlen = xh[off + 1] + 2;
+ if (optlen != 6 || len < 6 || (off & 3) != 2)
+ /* wrong jumbo option length/alignment */
+ return 0;
+ *pl = ntohl(*(__be32 *)(xh + off + 2));
+ goto done;
+ default:
+ optlen = xh[off + 1] + 2;
+ if (optlen > len)
+ /* ignore obscure options */
+ goto done;
+ break;
+ }
+ off += optlen;
+ len -= optlen;
+ }
+
+done:
+ return 1;
+}
+
+static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
+{
+ struct ipv6hdr *ip6h;
+ struct ipv6_opt_hdr *ip6xh;
+ unsigned int hl, ixhl;
+ unsigned int pl;
+ int ntkoff;
+ u8 nexthdr;
+
+ ntkoff = skb_network_offset(skb);
+
+ hl = sizeof(*ip6h);
+
+ if (!pskb_may_pull(skb, hl + ntkoff))
+ goto fail;
+
+ ip6h = ipv6_hdr(skb);
+
+ pl = ntohs(ip6h->payload_len);
+ nexthdr = ip6h->nexthdr;
+
+ do {
+ switch (nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ goto ignore_skb;
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_HOP:
+ case NEXTHDR_DEST:
+ if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
+ goto fail;
+ ip6xh = (void *)(skb_network_header(skb) + hl);
+ ixhl = ipv6_optlen(ip6xh);
+ if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
+ goto fail;
+ ip6xh = (void *)(skb_network_header(skb) + hl);
+ if ((nexthdr == NEXTHDR_HOP) &&
+ !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
+ goto fail;
+ nexthdr = ip6xh->nexthdr;
+ hl += ixhl;
+ break;
+ case IPPROTO_ICMPV6:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+ if (!tcf_csum_ipv6_icmp(skb,
+ hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
+ case IPPROTO_TCP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+ if (!tcf_csum_ipv6_tcp(skb,
+ hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
+ case IPPROTO_UDP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+ if (!tcf_csum_ipv6_udp(skb, hl,
+ pl + sizeof(*ip6h), 0))
+ goto fail;
+ goto done;
+ case IPPROTO_UDPLITE:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+ if (!tcf_csum_ipv6_udp(skb, hl,
+ pl + sizeof(*ip6h), 1))
+ goto fail;
+ goto done;
+ case IPPROTO_SCTP:
+ if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+ !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
+ default:
+ goto ignore_skb;
+ }
+ } while (pskb_may_pull(skb, hl + 1 + ntkoff));
+
+done:
+ignore_skb:
+ return 1;
+
+fail:
+ return 0;
+}
+
+static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_csum *p = to_tcf_csum(a);
+ bool orig_vlan_tag_present = false;
+ unsigned int vlan_hdr_count = 0;
+ struct tcf_csum_params *params;
+ u32 update_flags;
+ __be16 protocol;
+ int action;
+
+ params = rcu_dereference_bh(p->params);
+
+ tcf_lastuse_update(&p->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
+
+ action = READ_ONCE(p->tcf_action);
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
+
+ update_flags = params->update_flags;
+ protocol = skb_protocol(skb, false);
+again:
+ switch (protocol) {
+ case cpu_to_be16(ETH_P_IP):
+ if (!tcf_csum_ipv4(skb, update_flags))
+ goto drop;
+ break;
+ case cpu_to_be16(ETH_P_IPV6):
+ if (!tcf_csum_ipv6(skb, update_flags))
+ goto drop;
+ break;
+ case cpu_to_be16(ETH_P_8021AD): /* fall through */
+ case cpu_to_be16(ETH_P_8021Q):
+ if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) {
+ protocol = skb->protocol;
+ orig_vlan_tag_present = true;
+ } else {
+ struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data;
+
+ protocol = vlan->h_vlan_encapsulated_proto;
+ skb_pull(skb, VLAN_HLEN);
+ skb_reset_network_header(skb);
+ vlan_hdr_count++;
+ }
+ goto again;
+ }
+
+out:
+ /* Restore the skb for the pulled VLAN tags */
+ while (vlan_hdr_count--) {
+ skb_push(skb, VLAN_HLEN);
+ skb_reset_network_header(skb);
+ }
+
+ return action;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
+ action = TC_ACT_SHOT;
+ goto out;
+}
+
+static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_csum *p = to_tcf_csum(a);
+ struct tcf_csum_params *params;
+ struct tc_csum opt = {
+ .index = p->tcf_index,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&p->tcf_lock);
+ params = rcu_dereference_protected(p->params,
+ lockdep_is_held(&p->tcf_lock));
+ opt.action = p->tcf_action;
+ opt.update_flags = params->update_flags;
+
+ if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &p->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&p->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&p->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static void tcf_csum_cleanup(struct tc_action *a)
+{
+ struct tcf_csum *p = to_tcf_csum(a);
+ struct tcf_csum_params *params;
+
+ params = rcu_dereference_protected(p->params, 1);
+ if (params)
+ kfree_rcu(params, rcu);
+}
+
+static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static size_t tcf_csum_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_csum));
+}
+
+static struct tc_action_ops act_csum_ops = {
+ .kind = "csum",
+ .type = TCA_ACT_CSUM,
+ .owner = THIS_MODULE,
+ .act = tcf_csum_act,
+ .dump = tcf_csum_dump,
+ .init = tcf_csum_init,
+ .cleanup = tcf_csum_cleanup,
+ .walk = tcf_csum_walker,
+ .lookup = tcf_csum_search,
+ .get_fill_size = tcf_csum_get_fill_size,
+ .size = sizeof(struct tcf_csum),
+};
+
+static __net_init int csum_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ return tc_action_net_init(net, tn, &act_csum_ops);
+}
+
+static void __net_exit csum_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, csum_net_id);
+}
+
+static struct pernet_operations csum_net_ops = {
+ .init = csum_init_net,
+ .exit_batch = csum_exit_net,
+ .id = &csum_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_DESCRIPTION("Checksum updating actions");
+MODULE_LICENSE("GPL");
+
+static int __init csum_init_module(void)
+{
+ return tcf_register_action(&act_csum_ops, &csum_net_ops);
+}
+
+static void __exit csum_cleanup_module(void)
+{
+ tcf_unregister_action(&act_csum_ops, &csum_net_ops);
+}
+
+module_init(csum_init_module);
+module_exit(csum_cleanup_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
new file mode 100644
index 000000000..dfef96213
--- /dev/null
+++ b/net/sched/act_gact.c
@@ -0,0 +1,302 @@
+/*
+ * net/sched/act_gact.c Generic actions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2002-4)
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_gact.h>
+#include <net/tc_act/tc_gact.h>
+
+static unsigned int gact_net_id;
+static struct tc_action_ops act_gact_ops;
+
+#ifdef CONFIG_GACT_PROB
+static int gact_net_rand(struct tcf_gact *gact)
+{
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (prandom_u32() % gact->tcfg_pval)
+ return gact->tcf_action;
+ return gact->tcfg_paction;
+}
+
+static int gact_determ(struct tcf_gact *gact)
+{
+ u32 pack = atomic_inc_return(&gact->packets);
+
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (pack % gact->tcfg_pval)
+ return gact->tcf_action;
+ return gact->tcfg_paction;
+}
+
+typedef int (*g_rand)(struct tcf_gact *gact);
+static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
+#endif /* CONFIG_GACT_PROB */
+
+static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
+ [TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) },
+ [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) },
+};
+
+static int tcf_gact_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+ struct nlattr *tb[TCA_GACT_MAX + 1];
+ struct tc_gact *parm;
+ struct tcf_gact *gact;
+ int ret = 0;
+ u32 index;
+ int err;
+#ifdef CONFIG_GACT_PROB
+ struct tc_gact_p *p_parm = NULL;
+#endif
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_GACT_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_GACT_PARMS]);
+ index = parm->index;
+
+#ifndef CONFIG_GACT_PROB
+ if (tb[TCA_GACT_PROB] != NULL)
+ return -EOPNOTSUPP;
+#else
+ if (tb[TCA_GACT_PROB]) {
+ p_parm = nla_data(tb[TCA_GACT_PROB]);
+ if (p_parm->ptype >= MAX_RAND)
+ return -EINVAL;
+ }
+#endif
+
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_gact_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind)/* dont override defaults */
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+
+ gact = to_gact(*a);
+
+ spin_lock_bh(&gact->tcf_lock);
+ gact->tcf_action = parm->action;
+#ifdef CONFIG_GACT_PROB
+ if (p_parm) {
+ gact->tcfg_paction = p_parm->paction;
+ gact->tcfg_pval = max_t(u16, 1, p_parm->pval);
+ /* Make sure tcfg_pval is written before tcfg_ptype
+ * coupled with smp_rmb() in gact_net_rand() & gact_determ()
+ */
+ smp_wmb();
+ gact->tcfg_ptype = p_parm->ptype;
+ }
+#endif
+ spin_unlock_bh(&gact->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+}
+
+static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_gact *gact = to_gact(a);
+ int action = READ_ONCE(gact->tcf_action);
+
+#ifdef CONFIG_GACT_PROB
+ {
+ u32 ptype = READ_ONCE(gact->tcfg_ptype);
+
+ if (ptype)
+ action = gact_rand[ptype](gact);
+ }
+#endif
+ bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
+ if (action == TC_ACT_SHOT)
+ qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+
+ tcf_lastuse_update(&gact->tcf_tm);
+
+ return action;
+}
+
+static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse)
+{
+ struct tcf_gact *gact = to_gact(a);
+ int action = READ_ONCE(gact->tcf_action);
+ struct tcf_t *tm = &gact->tcf_tm;
+
+ _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
+ packets);
+ if (action == TC_ACT_SHOT)
+ this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
+
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
+static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_gact *gact = to_gact(a);
+ struct tc_gact opt = {
+ .index = gact->tcf_index,
+ .refcnt = refcount_read(&gact->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&gact->tcf_lock);
+ opt.action = gact->tcf_action;
+ if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+#ifdef CONFIG_GACT_PROB
+ if (gact->tcfg_ptype) {
+ struct tc_gact_p p_opt = {
+ .paction = gact->tcfg_paction,
+ .pval = gact->tcfg_pval,
+ .ptype = gact->tcfg_ptype,
+ };
+
+ if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt))
+ goto nla_put_failure;
+ }
+#endif
+ tcf_tm_dump(&t, &gact->tcf_tm);
+ if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&gact->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&gact->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static size_t tcf_gact_get_fill_size(const struct tc_action *act)
+{
+ size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */
+
+#ifdef CONFIG_GACT_PROB
+ if (to_gact(act)->tcfg_ptype)
+ /* TCA_GACT_PROB */
+ sz += nla_total_size(sizeof(struct tc_gact_p));
+#endif
+
+ return sz;
+}
+
+static struct tc_action_ops act_gact_ops = {
+ .kind = "gact",
+ .type = TCA_ACT_GACT,
+ .owner = THIS_MODULE,
+ .act = tcf_gact_act,
+ .stats_update = tcf_gact_stats_update,
+ .dump = tcf_gact_dump,
+ .init = tcf_gact_init,
+ .walk = tcf_gact_walker,
+ .lookup = tcf_gact_search,
+ .get_fill_size = tcf_gact_get_fill_size,
+ .size = sizeof(struct tcf_gact),
+};
+
+static __net_init int gact_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ return tc_action_net_init(net, tn, &act_gact_ops);
+}
+
+static void __net_exit gact_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, gact_net_id);
+}
+
+static struct pernet_operations gact_net_ops = {
+ .init = gact_init_net,
+ .exit_batch = gact_exit_net,
+ .id = &gact_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_DESCRIPTION("Generic Classifier actions");
+MODULE_LICENSE("GPL");
+
+static int __init gact_init_module(void)
+{
+#ifdef CONFIG_GACT_PROB
+ pr_info("GACT probability on\n");
+#else
+ pr_info("GACT probability NOT on\n");
+#endif
+
+ return tcf_register_action(&act_gact_ops, &gact_net_ops);
+}
+
+static void __exit gact_cleanup_module(void)
+{
+ tcf_unregister_action(&act_gact_ops, &gact_net_ops);
+}
+
+module_init(gact_init_module);
+module_exit(gact_cleanup_module);
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
new file mode 100644
index 000000000..bac353bea
--- /dev/null
+++ b/net/sched/act_ife.c
@@ -0,0 +1,920 @@
+/*
+ * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB
+ *
+ * Refer to:
+ * draft-ietf-forces-interfelfb-03
+ * and
+ * netdev01 paper:
+ * "Distributing Linux Traffic Control Classifier-Action
+ * Subsystem"
+ * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/etherdevice.h>
+#include <net/ife.h>
+
+static unsigned int ife_net_id;
+static int max_metacnt = IFE_META_MAX + 1;
+static struct tc_action_ops act_ife_ops;
+
+static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
+ [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
+ [TCA_IFE_DMAC] = { .len = ETH_ALEN},
+ [TCA_IFE_SMAC] = { .len = ETH_ALEN},
+ [TCA_IFE_TYPE] = { .type = NLA_U16},
+};
+
+int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+ u16 edata = 0;
+
+ if (mi->metaval)
+ edata = *(u16 *)mi->metaval;
+ else if (metaval)
+ edata = metaval;
+
+ if (!edata) /* will not encode */
+ return 0;
+
+ edata = htons(edata);
+ return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u16);
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+ if (mi->metaval)
+ return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval);
+ else
+ return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u32);
+
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
+{
+ if (metaval || mi->metaval)
+ return 8; /* T+L+V == 2+2+4 */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+
+int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi)
+{
+ if (metaval || mi->metaval)
+ return 8; /* T+L+(V) == 2+2+(2+2bytepad) */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u16);
+
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+ u32 edata = metaval;
+
+ if (mi->metaval)
+ edata = *(u32 *)mi->metaval;
+ else if (metaval)
+ edata = metaval;
+
+ if (!edata) /* will not encode */
+ return 0;
+
+ edata = htonl(edata);
+ return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u32);
+
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+ if (mi->metaval)
+ return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval);
+ else
+ return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u16);
+
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
+{
+ mi->metaval = kmemdup(metaval, sizeof(u32), gfp);
+ if (!mi->metaval)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
+
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
+{
+ mi->metaval = kmemdup(metaval, sizeof(u16), gfp);
+ if (!mi->metaval)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u16);
+
+void ife_release_meta_gen(struct tcf_meta_info *mi)
+{
+ kfree(mi->metaval);
+}
+EXPORT_SYMBOL_GPL(ife_release_meta_gen);
+
+int ife_validate_meta_u32(void *val, int len)
+{
+ if (len == sizeof(u32))
+ return 0;
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u32);
+
+int ife_validate_meta_u16(void *val, int len)
+{
+ /* length will not include padding */
+ if (len == sizeof(u16))
+ return 0;
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u16);
+
+static LIST_HEAD(ifeoplist);
+static DEFINE_RWLOCK(ife_mod_lock);
+
+static struct tcf_meta_ops *find_ife_oplist(u16 metaid)
+{
+ struct tcf_meta_ops *o;
+
+ read_lock(&ife_mod_lock);
+ list_for_each_entry(o, &ifeoplist, list) {
+ if (o->metaid == metaid) {
+ if (!try_module_get(o->owner))
+ o = NULL;
+ read_unlock(&ife_mod_lock);
+ return o;
+ }
+ }
+ read_unlock(&ife_mod_lock);
+
+ return NULL;
+}
+
+int register_ife_op(struct tcf_meta_ops *mops)
+{
+ struct tcf_meta_ops *m;
+
+ if (!mops->metaid || !mops->metatype || !mops->name ||
+ !mops->check_presence || !mops->encode || !mops->decode ||
+ !mops->get || !mops->alloc)
+ return -EINVAL;
+
+ write_lock(&ife_mod_lock);
+
+ list_for_each_entry(m, &ifeoplist, list) {
+ if (m->metaid == mops->metaid ||
+ (strcmp(mops->name, m->name) == 0)) {
+ write_unlock(&ife_mod_lock);
+ return -EEXIST;
+ }
+ }
+
+ if (!mops->release)
+ mops->release = ife_release_meta_gen;
+
+ list_add_tail(&mops->list, &ifeoplist);
+ write_unlock(&ife_mod_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ife_op);
+
+int unregister_ife_op(struct tcf_meta_ops *mops)
+{
+ struct tcf_meta_ops *m;
+ int err = -ENOENT;
+
+ write_lock(&ife_mod_lock);
+ list_for_each_entry(m, &ifeoplist, list) {
+ if (m->metaid == mops->metaid) {
+ list_del(&mops->list);
+ err = 0;
+ break;
+ }
+ }
+ write_unlock(&ife_mod_lock);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_ife_op);
+
+static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
+{
+ int ret = 0;
+ /* XXX: unfortunately cant use nla_policy at this point
+ * because a length of 0 is valid in the case of
+ * "allow". "use" semantics do enforce for proper
+ * length and i couldve use nla_policy but it makes it hard
+ * to use it just for that..
+ */
+ if (ops->validate)
+ return ops->validate(val, len);
+
+ if (ops->metatype == NLA_U32)
+ ret = ife_validate_meta_u32(val, len);
+ else if (ops->metatype == NLA_U16)
+ ret = ife_validate_meta_u16(val, len);
+
+ return ret;
+}
+
+#ifdef CONFIG_MODULES
+static const char *ife_meta_id2name(u32 metaid)
+{
+ switch (metaid) {
+ case IFE_META_SKBMARK:
+ return "skbmark";
+ case IFE_META_PRIO:
+ return "skbprio";
+ case IFE_META_TCINDEX:
+ return "tcindex";
+ default:
+ return "unknown";
+ }
+}
+#endif
+
+/* called when adding new meta information
+*/
+static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held)
+{
+ struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+ int ret = 0;
+
+ if (!ops) {
+ ret = -ENOENT;
+#ifdef CONFIG_MODULES
+ if (rtnl_held)
+ rtnl_unlock();
+ request_module("ife-meta-%s", ife_meta_id2name(metaid));
+ if (rtnl_held)
+ rtnl_lock();
+ ops = find_ife_oplist(metaid);
+#endif
+ }
+
+ if (ops) {
+ ret = 0;
+ if (len)
+ ret = ife_validate_metatype(ops, val, len);
+
+ module_put(ops->owner);
+ }
+
+ return ret;
+}
+
+/* called when adding new meta information
+*/
+static int __add_metainfo(const struct tcf_meta_ops *ops,
+ struct tcf_ife_info *ife, u32 metaid, void *metaval,
+ int len, bool atomic, bool exists)
+{
+ struct tcf_meta_info *mi = NULL;
+ int ret = 0;
+
+ mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL);
+ if (!mi)
+ return -ENOMEM;
+
+ mi->metaid = metaid;
+ mi->ops = ops;
+ if (len > 0) {
+ ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL);
+ if (ret != 0) {
+ kfree(mi);
+ return ret;
+ }
+ }
+
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
+ list_add_tail(&mi->metalist, &ife->metalist);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
+
+ return ret;
+}
+
+static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops,
+ struct tcf_ife_info *ife, u32 metaid,
+ bool exists)
+{
+ int ret;
+
+ if (!try_module_get(ops->owner))
+ return -ENOENT;
+ ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists);
+ if (ret)
+ module_put(ops->owner);
+ return ret;
+}
+
+static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
+ int len, bool exists)
+{
+ const struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+ int ret;
+
+ if (!ops)
+ return -ENOENT;
+ ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists);
+ if (ret)
+ /*put back what find_ife_oplist took */
+ module_put(ops->owner);
+ return ret;
+}
+
+static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
+{
+ struct tcf_meta_ops *o;
+ int rc = 0;
+ int installed = 0;
+
+ read_lock(&ife_mod_lock);
+ list_for_each_entry(o, &ifeoplist, list) {
+ rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists);
+ if (rc == 0)
+ installed += 1;
+ }
+ read_unlock(&ife_mod_lock);
+
+ if (installed)
+ return 0;
+ else
+ return -EINVAL;
+}
+
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+ struct tcf_meta_info *e;
+ struct nlattr *nest;
+ unsigned char *b = skb_tail_pointer(skb);
+ int total_encoded = 0;
+
+ /*can only happen on decode */
+ if (list_empty(&ife->metalist))
+ return 0;
+
+ nest = nla_nest_start(skb, TCA_IFE_METALST);
+ if (!nest)
+ goto out_nlmsg_trim;
+
+ list_for_each_entry(e, &ife->metalist, metalist) {
+ if (!e->ops->get(skb, e))
+ total_encoded += 1;
+ }
+
+ if (!total_encoded)
+ goto out_nlmsg_trim;
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+
+out_nlmsg_trim:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+/* under ife->tcf_lock */
+static void _tcf_ife_cleanup(struct tc_action *a)
+{
+ struct tcf_ife_info *ife = to_ife(a);
+ struct tcf_meta_info *e, *n;
+
+ list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+ list_del(&e->metalist);
+ if (e->metaval) {
+ if (e->ops->release)
+ e->ops->release(e);
+ else
+ kfree(e->metaval);
+ }
+ module_put(e->ops->owner);
+ kfree(e);
+ }
+}
+
+static void tcf_ife_cleanup(struct tc_action *a)
+{
+ struct tcf_ife_info *ife = to_ife(a);
+ struct tcf_ife_params *p;
+
+ spin_lock_bh(&ife->tcf_lock);
+ _tcf_ife_cleanup(a);
+ spin_unlock_bh(&ife->tcf_lock);
+
+ p = rcu_dereference_protected(ife->params, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
+ bool exists, bool rtnl_held)
+{
+ int len = 0;
+ int rc = 0;
+ int i = 0;
+ void *val;
+
+ for (i = 1; i < max_metacnt; i++) {
+ if (tb[i]) {
+ val = nla_data(tb[i]);
+ len = nla_len(tb[i]);
+
+ rc = load_metaops_and_vet(i, val, len, rtnl_held);
+ if (rc != 0)
+ return rc;
+
+ rc = add_metainfo(ife, i, val, len, exists);
+ if (rc)
+ return rc;
+ }
+ }
+
+ return rc;
+}
+
+static int tcf_ife_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+ struct nlattr *tb[TCA_IFE_MAX + 1];
+ struct nlattr *tb2[IFE_META_MAX + 1];
+ struct tcf_ife_params *p;
+ struct tcf_ife_info *ife;
+ u16 ife_type = ETH_P_IFE;
+ struct tc_ife *parm;
+ u8 *daddr = NULL;
+ u8 *saddr = NULL;
+ bool exists = false;
+ int ret = 0;
+ u32 index;
+ int err;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_IFE_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_IFE_PARMS]);
+
+ /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because
+ * they cannot run as the same time. Check on all other values which
+ * are not supported right now.
+ */
+ if (parm->flags & ~IFE_ENCODE)
+ return -EINVAL;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0) {
+ kfree(p);
+ return err;
+ }
+ exists = err;
+ if (exists && bind) {
+ kfree(p);
+ return 0;
+ }
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
+ bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ kfree(p);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ kfree(p);
+ return -EEXIST;
+ }
+
+ ife = to_ife(*a);
+ p->flags = parm->flags;
+
+ if (parm->flags & IFE_ENCODE) {
+ if (tb[TCA_IFE_TYPE])
+ ife_type = nla_get_u16(tb[TCA_IFE_TYPE]);
+ if (tb[TCA_IFE_DMAC])
+ daddr = nla_data(tb[TCA_IFE_DMAC]);
+ if (tb[TCA_IFE_SMAC])
+ saddr = nla_data(tb[TCA_IFE_SMAC]);
+ }
+
+ if (parm->flags & IFE_ENCODE) {
+ if (daddr)
+ ether_addr_copy(p->eth_dst, daddr);
+ else
+ eth_zero_addr(p->eth_dst);
+
+ if (saddr)
+ ether_addr_copy(p->eth_src, saddr);
+ else
+ eth_zero_addr(p->eth_src);
+
+ p->eth_type = ife_type;
+ }
+
+
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&ife->metalist);
+
+ if (tb[TCA_IFE_METALST]) {
+ err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
+ NULL, NULL);
+ if (err) {
+metadata_parse_err:
+ tcf_idr_release(*a, bind);
+ kfree(p);
+ return err;
+ }
+
+ err = populate_metalist(ife, tb2, exists, rtnl_held);
+ if (err)
+ goto metadata_parse_err;
+
+ } else {
+ /* if no passed metadata allow list or passed allow-all
+ * then here we process by adding as many supported metadatum
+ * as we can. You better have at least one else we are
+ * going to bail out
+ */
+ err = use_all_metadata(ife, exists);
+ if (err) {
+ tcf_idr_release(*a, bind);
+ kfree(p);
+ return err;
+ }
+ }
+
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
+ ife->tcf_action = parm->action;
+ /* protected by tcf_lock when modifying existing action */
+ rcu_swap_protected(ife->params, p, 1);
+
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
+ if (p)
+ kfree_rcu(p, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+}
+
+static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ife_info *ife = to_ife(a);
+ struct tcf_ife_params *p;
+ struct tc_ife opt = {
+ .index = ife->tcf_index,
+ .refcnt = refcount_read(&ife->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&ife->tcf_lock);
+ opt.action = ife->tcf_action;
+ p = rcu_dereference_protected(ife->params,
+ lockdep_is_held(&ife->tcf_lock));
+ opt.flags = p->flags;
+
+ if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &ife->tcf_tm);
+ if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD))
+ goto nla_put_failure;
+
+ if (!is_zero_ether_addr(p->eth_dst)) {
+ if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst))
+ goto nla_put_failure;
+ }
+
+ if (!is_zero_ether_addr(p->eth_src)) {
+ if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src))
+ goto nla_put_failure;
+ }
+
+ if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type))
+ goto nla_put_failure;
+
+ if (dump_metalist(skb, ife)) {
+ /*ignore failure to dump metalist */
+ pr_info("Failed to dump metalist\n");
+ }
+
+ spin_unlock_bh(&ife->tcf_lock);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&ife->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+ u16 metaid, u16 mlen, void *mdata)
+{
+ struct tcf_meta_info *e;
+
+ /* XXX: use hash to speed up */
+ list_for_each_entry(e, &ife->metalist, metalist) {
+ if (metaid == e->metaid) {
+ if (e->ops) {
+ /* We check for decode presence already */
+ return e->ops->decode(skb, mdata, mlen);
+ }
+ }
+ }
+
+ return -ENOENT;
+}
+
+static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_ife_info *ife = to_ife(a);
+ int action = ife->tcf_action;
+ u8 *ifehdr_end;
+ u8 *tlv_data;
+ u16 metalen;
+
+ bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
+ tcf_lastuse_update(&ife->tcf_tm);
+
+ if (skb_at_tc_ingress(skb))
+ skb_push(skb, skb->dev->hard_header_len);
+
+ tlv_data = ife_decode(skb, &metalen);
+ if (unlikely(!tlv_data)) {
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
+
+ ifehdr_end = tlv_data + metalen;
+ for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
+ u8 *curr_data;
+ u16 mtype;
+ u16 dlen;
+
+ curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype,
+ &dlen, NULL);
+ if (!curr_data) {
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
+
+ if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
+ /* abuse overlimits to count when we receive metadata
+ * but dont have an ops for it
+ */
+ pr_info_ratelimited("Unknown metaid %d dlen %d\n",
+ mtype, dlen);
+ qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ }
+ }
+
+ if (WARN_ON(tlv_data != ifehdr_end)) {
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
+
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ skb_reset_network_header(skb);
+
+ return action;
+}
+
+/*XXX: check if we can do this at install time instead of current
+ * send data path
+**/
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+ struct tcf_meta_info *e, *n;
+ int tot_run_sz = 0, run_sz = 0;
+
+ list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+ if (e->ops->check_presence) {
+ run_sz = e->ops->check_presence(skb, e);
+ tot_run_sz += run_sz;
+ }
+ }
+
+ return tot_run_sz;
+}
+
+static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res, struct tcf_ife_params *p)
+{
+ struct tcf_ife_info *ife = to_ife(a);
+ int action = ife->tcf_action;
+ struct ethhdr *oethh; /* outer ether header */
+ struct tcf_meta_info *e;
+ /*
+ OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+ where ORIGDATA = original ethernet header ...
+ */
+ u16 metalen = ife_get_sz(skb, ife);
+ int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
+ unsigned int skboff = 0;
+ int new_len = skb->len + hdrm;
+ bool exceed_mtu = false;
+ void *ife_meta;
+ int err = 0;
+
+ if (!skb_at_tc_ingress(skb)) {
+ if (new_len > skb->dev->mtu)
+ exceed_mtu = true;
+ }
+
+ bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
+ tcf_lastuse_update(&ife->tcf_tm);
+
+ if (!metalen) { /* no metadata to send */
+ /* abuse overlimits to count when we allow packet
+ * with no metadata
+ */
+ qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return action;
+ }
+ /* could be stupid policy setup or mtu config
+ * so lets be conservative.. */
+ if ((action == TC_ACT_SHOT) || exceed_mtu) {
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
+
+ if (skb_at_tc_ingress(skb))
+ skb_push(skb, skb->dev->hard_header_len);
+
+ ife_meta = ife_encode(skb, metalen);
+
+ spin_lock(&ife->tcf_lock);
+
+ /* XXX: we dont have a clever way of telling encode to
+ * not repeat some of the computations that are done by
+ * ops->presence_check...
+ */
+ list_for_each_entry(e, &ife->metalist, metalist) {
+ if (e->ops->encode) {
+ err = e->ops->encode(skb, (void *)(ife_meta + skboff),
+ e);
+ }
+ if (err < 0) {
+ /* too corrupt to keep around if overwritten */
+ spin_unlock(&ife->tcf_lock);
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
+ skboff += err;
+ }
+ spin_unlock(&ife->tcf_lock);
+ oethh = (struct ethhdr *)skb->data;
+
+ if (!is_zero_ether_addr(p->eth_src))
+ ether_addr_copy(oethh->h_source, p->eth_src);
+ if (!is_zero_ether_addr(p->eth_dst))
+ ether_addr_copy(oethh->h_dest, p->eth_dst);
+ oethh->h_proto = htons(p->eth_type);
+
+ if (skb_at_tc_ingress(skb))
+ skb_pull(skb, skb->dev->hard_header_len);
+
+ return action;
+}
+
+static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_ife_info *ife = to_ife(a);
+ struct tcf_ife_params *p;
+ int ret;
+
+ p = rcu_dereference_bh(ife->params);
+ if (p->flags & IFE_ENCODE) {
+ ret = tcf_ife_encode(skb, a, res, p);
+ return ret;
+ }
+
+ return tcf_ife_decode(skb, a, res);
+}
+
+static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ife_ops = {
+ .kind = "ife",
+ .type = TCA_ACT_IFE,
+ .owner = THIS_MODULE,
+ .act = tcf_ife_act,
+ .dump = tcf_ife_dump,
+ .cleanup = tcf_ife_cleanup,
+ .init = tcf_ife_init,
+ .walk = tcf_ife_walker,
+ .lookup = tcf_ife_search,
+ .size = sizeof(struct tcf_ife_info),
+};
+
+static __net_init int ife_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ return tc_action_net_init(net, tn, &act_ife_ops);
+}
+
+static void __net_exit ife_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, ife_net_id);
+}
+
+static struct pernet_operations ife_net_ops = {
+ .init = ife_init_net,
+ .exit_batch = ife_exit_net,
+ .id = &ife_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init ife_init_module(void)
+{
+ return tcf_register_action(&act_ife_ops, &ife_net_ops);
+}
+
+static void __exit ife_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ife_ops, &ife_net_ops);
+}
+
+module_init(ife_init_module);
+module_exit(ife_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
new file mode 100644
index 000000000..01d3669ef
--- /dev/null
+++ b/net/sched/act_ipt.c
@@ -0,0 +1,452 @@
+/*
+ * net/sched/act_ipt.c iptables target interface
+ *
+ *TODO: Add other tables. For now we only support the ipv4 table targets
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright: Jamal Hadi Salim (2002-13)
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_ipt.h>
+#include <net/tc_act/tc_ipt.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+
+static unsigned int ipt_net_id;
+static struct tc_action_ops act_ipt_ops;
+
+static unsigned int xt_net_id;
+static struct tc_action_ops act_xt_ops;
+
+static int ipt_init_target(struct net *net, struct xt_entry_target *t,
+ char *table, unsigned int hook)
+{
+ struct xt_tgchk_param par;
+ struct xt_target *target;
+ struct ipt_entry e = {};
+ int ret = 0;
+
+ target = xt_request_find_target(AF_INET, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target))
+ return PTR_ERR(target);
+
+ t->u.kernel.target = target;
+ memset(&par, 0, sizeof(par));
+ par.net = net;
+ par.table = table;
+ par.entryinfo = &e;
+ par.target = target;
+ par.targinfo = t->data;
+ par.hook_mask = hook;
+ par.family = NFPROTO_IPV4;
+
+ ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
+ if (ret < 0) {
+ module_put(t->u.kernel.target->me);
+ return ret;
+ }
+ return 0;
+}
+
+static void ipt_destroy_target(struct xt_entry_target *t, struct net *net)
+{
+ struct xt_tgdtor_param par = {
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .family = NFPROTO_IPV4,
+ .net = net,
+ };
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+}
+
+static void tcf_ipt_release(struct tc_action *a)
+{
+ struct tcf_ipt *ipt = to_ipt(a);
+
+ if (ipt->tcfi_t) {
+ ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net);
+ kfree(ipt->tcfi_t);
+ }
+ kfree(ipt->tcfi_tname);
+}
+
+static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
+ [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
+ [TCA_IPT_HOOK] = { .type = NLA_U32 },
+ [TCA_IPT_INDEX] = { .type = NLA_U32 },
+ [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
+};
+
+static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ const struct tc_action_ops *ops, int ovr, int bind)
+{
+ struct tc_action_net *tn = net_generic(net, id);
+ struct nlattr *tb[TCA_IPT_MAX + 1];
+ struct tcf_ipt *ipt;
+ struct xt_entry_target *td, *t;
+ char *tname;
+ bool exists = false;
+ int ret = 0, err;
+ u32 hook = 0;
+ u32 index = 0;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_IPT_INDEX] != NULL)
+ index = nla_get_u32(tb[TCA_IPT_INDEX]);
+
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+
+ td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
+ if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a, ops, bind,
+ false);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else {
+ if (bind)/* dont override defaults */
+ return 0;
+
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ }
+ hook = nla_get_u32(tb[TCA_IPT_HOOK]);
+
+ err = -ENOMEM;
+ tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
+ if (unlikely(!tname))
+ goto err1;
+ if (tb[TCA_IPT_TABLE] == NULL ||
+ nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
+ strcpy(tname, "mangle");
+
+ t = kmemdup(td, td->u.target_size, GFP_KERNEL);
+ if (unlikely(!t))
+ goto err2;
+
+ err = ipt_init_target(net, t, tname, hook);
+ if (err < 0)
+ goto err3;
+
+ ipt = to_ipt(*a);
+
+ spin_lock_bh(&ipt->tcf_lock);
+ if (ret != ACT_P_CREATED) {
+ ipt_destroy_target(ipt->tcfi_t, net);
+ kfree(ipt->tcfi_tname);
+ kfree(ipt->tcfi_t);
+ }
+ ipt->tcfi_tname = tname;
+ ipt->tcfi_t = t;
+ ipt->tcfi_hook = hook;
+ spin_unlock_bh(&ipt->tcf_lock);
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+
+err3:
+ kfree(t);
+err2:
+ kfree(tname);
+err1:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static int tcf_ipt_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a, int ovr,
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
+ bind);
+}
+
+static int tcf_xt_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a, int ovr,
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
+{
+ return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
+ bind);
+}
+
+static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ int ret = 0, result = 0;
+ struct tcf_ipt *ipt = to_ipt(a);
+ struct xt_action_param par;
+ struct nf_hook_state state = {
+ .net = dev_net(skb->dev),
+ .in = skb->dev,
+ .hook = ipt->tcfi_hook,
+ .pf = NFPROTO_IPV4,
+ };
+
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return TC_ACT_UNSPEC;
+
+ spin_lock(&ipt->tcf_lock);
+
+ tcf_lastuse_update(&ipt->tcf_tm);
+ bstats_update(&ipt->tcf_bstats, skb);
+
+ /* yes, we have to worry about both in and out dev
+ * worry later - danger - this API seems to have changed
+ * from earlier kernels
+ */
+ par.state = &state;
+ par.target = ipt->tcfi_t->u.kernel.target;
+ par.targinfo = ipt->tcfi_t->data;
+ ret = par.target->target(skb, &par);
+
+ switch (ret) {
+ case NF_ACCEPT:
+ result = TC_ACT_OK;
+ break;
+ case NF_DROP:
+ result = TC_ACT_SHOT;
+ ipt->tcf_qstats.drops++;
+ break;
+ case XT_CONTINUE:
+ result = TC_ACT_PIPE;
+ break;
+ default:
+ net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
+ ret);
+ result = TC_ACT_OK;
+ break;
+ }
+ spin_unlock(&ipt->tcf_lock);
+ return result;
+
+}
+
+static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ipt *ipt = to_ipt(a);
+ struct xt_entry_target *t;
+ struct tcf_t tm;
+ struct tc_cnt c;
+
+ /* for simple targets kernel size == user size
+ * user name = target name
+ * for foolproof you need to not assume this
+ */
+
+ spin_lock_bh(&ipt->tcf_lock);
+ t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
+ if (unlikely(!t))
+ goto nla_put_failure;
+
+ c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind;
+ c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref;
+ strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
+
+ if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
+ nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) ||
+ nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) ||
+ nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
+ nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&tm, &ipt->tcf_tm);
+ if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&ipt->tcf_lock);
+ kfree(t);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&ipt->tcf_lock);
+ nlmsg_trim(skb, b);
+ kfree(t);
+ return -1;
+}
+
+static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ipt_ops = {
+ .kind = "ipt",
+ .type = TCA_ACT_IPT,
+ .owner = THIS_MODULE,
+ .act = tcf_ipt_act,
+ .dump = tcf_ipt_dump,
+ .cleanup = tcf_ipt_release,
+ .init = tcf_ipt_init,
+ .walk = tcf_ipt_walker,
+ .lookup = tcf_ipt_search,
+ .size = sizeof(struct tcf_ipt),
+};
+
+static __net_init int ipt_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return tc_action_net_init(net, tn, &act_ipt_ops);
+}
+
+static void __net_exit ipt_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, ipt_net_id);
+}
+
+static struct pernet_operations ipt_net_ops = {
+ .init = ipt_init_net,
+ .exit_batch = ipt_exit_net,
+ .id = &ipt_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_xt_ops = {
+ .kind = "xt",
+ .type = TCA_ACT_XT,
+ .owner = THIS_MODULE,
+ .act = tcf_ipt_act,
+ .dump = tcf_ipt_dump,
+ .cleanup = tcf_ipt_release,
+ .init = tcf_xt_init,
+ .walk = tcf_xt_walker,
+ .lookup = tcf_xt_search,
+ .size = sizeof(struct tcf_ipt),
+};
+
+static __net_init int xt_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return tc_action_net_init(net, tn, &act_xt_ops);
+}
+
+static void __net_exit xt_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, xt_net_id);
+}
+
+static struct pernet_operations xt_net_ops = {
+ .init = xt_init_net,
+ .exit_batch = xt_exit_net,
+ .id = &xt_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
+MODULE_DESCRIPTION("Iptables target actions");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("act_xt");
+
+static int __init ipt_init_module(void)
+{
+ int ret1, ret2;
+
+ ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops);
+ if (ret1 < 0)
+ pr_err("Failed to load xt action\n");
+
+ ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops);
+ if (ret2 < 0)
+ pr_err("Failed to load ipt action\n");
+
+ if (ret1 < 0 && ret2 < 0) {
+ return ret1;
+ } else
+ return 0;
+}
+
+static void __exit ipt_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ipt_ops, &ipt_net_ops);
+ tcf_unregister_action(&act_xt_ops, &xt_net_ops);
+}
+
+module_init(ipt_init_module);
+module_exit(ipt_cleanup_module);
diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c
new file mode 100644
index 000000000..6445184b2
--- /dev/null
+++ b/net/sched/act_meta_mark.c
@@ -0,0 +1,78 @@
+/*
+ * net/sched/act_meta_mark.c IFE skb->mark metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+
+static int skbmark_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifemark = skb->mark;
+
+ return ife_encode_meta_u32(ifemark, skbdata, e);
+}
+
+static int skbmark_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u32 ifemark = *(u32 *)data;
+
+ skb->mark = ntohl(ifemark);
+ return 0;
+}
+
+static int skbmark_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u32(skb->mark, e);
+}
+
+static struct tcf_meta_ops ife_skbmark_ops = {
+ .metaid = IFE_META_SKBMARK,
+ .metatype = NLA_U32,
+ .name = "skbmark",
+ .synopsis = "skb mark 32 bit metadata",
+ .check_presence = skbmark_check,
+ .encode = skbmark_encode,
+ .decode = skbmark_decode,
+ .get = ife_get_meta_u32,
+ .alloc = ife_alloc_meta_u32,
+ .release = ife_release_meta_gen,
+ .validate = ife_validate_meta_u32,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifemark_init_module(void)
+{
+ return register_ife_op(&ife_skbmark_ops);
+}
+
+static void __exit ifemark_cleanup_module(void)
+{
+ unregister_ife_op(&ife_skbmark_ops);
+}
+
+module_init(ifemark_init_module);
+module_exit(ifemark_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE skb mark metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META("skbmark");
diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c
new file mode 100644
index 000000000..4033f9fc4
--- /dev/null
+++ b/net/sched/act_meta_skbprio.c
@@ -0,0 +1,76 @@
+/*
+ * net/sched/act_meta_prio.c IFE skb->priority metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+
+static int skbprio_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u32(skb->priority, e);
+}
+
+static int skbprio_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifeprio = skb->priority; /* avoid having to cast skb->priority*/
+
+ return ife_encode_meta_u32(ifeprio, skbdata, e);
+}
+
+static int skbprio_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u32 ifeprio = *(u32 *)data;
+
+ skb->priority = ntohl(ifeprio);
+ return 0;
+}
+
+static struct tcf_meta_ops ife_prio_ops = {
+ .metaid = IFE_META_PRIO,
+ .metatype = NLA_U32,
+ .name = "skbprio",
+ .synopsis = "skb prio metadata",
+ .check_presence = skbprio_check,
+ .encode = skbprio_encode,
+ .decode = skbprio_decode,
+ .get = ife_get_meta_u32,
+ .alloc = ife_alloc_meta_u32,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifeprio_init_module(void)
+{
+ return register_ife_op(&ife_prio_ops);
+}
+
+static void __exit ifeprio_cleanup_module(void)
+{
+ unregister_ife_op(&ife_prio_ops);
+}
+
+module_init(ifeprio_init_module);
+module_exit(ifeprio_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE skb prio metadata action");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META("skbprio");
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
new file mode 100644
index 000000000..7221437ca
--- /dev/null
+++ b/net/sched/act_meta_skbtcindex.c
@@ -0,0 +1,78 @@
+/*
+ * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2016)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+
+static int skbtcindex_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifetc_index = skb->tc_index;
+
+ return ife_encode_meta_u16(ifetc_index, skbdata, e);
+}
+
+static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u16 ifetc_index = *(u16 *)data;
+
+ skb->tc_index = ntohs(ifetc_index);
+ return 0;
+}
+
+static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u16(skb->tc_index, e);
+}
+
+static struct tcf_meta_ops ife_skbtcindex_ops = {
+ .metaid = IFE_META_TCINDEX,
+ .metatype = NLA_U16,
+ .name = "tc_index",
+ .synopsis = "skb tc_index 16 bit metadata",
+ .check_presence = skbtcindex_check,
+ .encode = skbtcindex_encode,
+ .decode = skbtcindex_decode,
+ .get = ife_get_meta_u16,
+ .alloc = ife_alloc_meta_u16,
+ .release = ife_release_meta_gen,
+ .validate = ife_validate_meta_u16,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifetc_index_init_module(void)
+{
+ return register_ife_op(&ife_skbtcindex_ops);
+}
+
+static void __exit ifetc_index_cleanup_module(void)
+{
+ unregister_ife_op(&ife_skbtcindex_ops);
+}
+
+module_init(ifetc_index_init_module);
+module_exit(ifetc_index_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2016)");
+MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META("tcindex");
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
new file mode 100644
index 000000000..a30c17a28
--- /dev/null
+++ b/net/sched/act_mirred.c
@@ -0,0 +1,462 @@
+/*
+ * net/sched/act_mirred.c packet mirroring and redirect actions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim (2002-4)
+ *
+ * TODO: Add ingress support (and socket redirect support)
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/if_arp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_mirred.h>
+
+static LIST_HEAD(mirred_list);
+static DEFINE_SPINLOCK(mirred_list_lock);
+
+static bool tcf_mirred_is_act_redirect(int action)
+{
+ return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
+}
+
+static bool tcf_mirred_act_wants_ingress(int action)
+{
+ switch (action) {
+ case TCA_EGRESS_REDIR:
+ case TCA_EGRESS_MIRROR:
+ return false;
+ case TCA_INGRESS_REDIR:
+ case TCA_INGRESS_MIRROR:
+ return true;
+ default:
+ BUG();
+ }
+}
+
+static bool tcf_mirred_can_reinsert(int action)
+{
+ switch (action) {
+ case TC_ACT_SHOT:
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ return true;
+ }
+ return false;
+}
+
+static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m)
+{
+ return rcu_dereference_protected(m->tcfm_dev,
+ lockdep_is_held(&m->tcf_lock));
+}
+
+static void tcf_mirred_release(struct tc_action *a)
+{
+ struct tcf_mirred *m = to_mirred(a);
+ struct net_device *dev;
+
+ spin_lock(&mirred_list_lock);
+ list_del(&m->tcfm_list);
+ spin_unlock(&mirred_list_lock);
+
+ /* last reference to action, no need to lock */
+ dev = rcu_dereference_protected(m->tcfm_dev, 1);
+ if (dev)
+ dev_put(dev);
+}
+
+static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
+ [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
+};
+
+static unsigned int mirred_net_id;
+static struct tc_action_ops act_mirred_ops;
+
+static int tcf_mirred_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+ struct nlattr *tb[TCA_MIRRED_MAX + 1];
+ bool mac_header_xmit = false;
+ struct tc_mirred *parm;
+ struct tcf_mirred *m;
+ struct net_device *dev;
+ bool exists = false;
+ int ret, err;
+ u32 index;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
+ return -EINVAL;
+ }
+ ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack);
+ if (ret < 0)
+ return ret;
+ if (!tb[TCA_MIRRED_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_MIRRED_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ switch (parm->eaction) {
+ case TCA_EGRESS_MIRROR:
+ case TCA_EGRESS_REDIR:
+ case TCA_INGRESS_REDIR:
+ case TCA_INGRESS_MIRROR:
+ break;
+ default:
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
+ return -EINVAL;
+ }
+
+ if (!exists) {
+ if (!parm->ifindex) {
+ tcf_idr_cleanup(tn, index);
+ NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
+ return -EINVAL;
+ }
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_mirred_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ m = to_mirred(*a);
+
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&m->tcfm_list);
+
+ spin_lock_bh(&m->tcf_lock);
+ m->tcf_action = parm->action;
+ m->tcfm_eaction = parm->eaction;
+
+ if (parm->ifindex) {
+ dev = dev_get_by_index(net, parm->ifindex);
+ if (!dev) {
+ spin_unlock_bh(&m->tcf_lock);
+ tcf_idr_release(*a, bind);
+ return -ENODEV;
+ }
+ mac_header_xmit = dev_is_mac_header_xmit(dev);
+ rcu_swap_protected(m->tcfm_dev, dev,
+ lockdep_is_held(&m->tcf_lock));
+ if (dev)
+ dev_put(dev);
+ m->tcfm_mac_header_xmit = mac_header_xmit;
+ }
+ spin_unlock_bh(&m->tcf_lock);
+
+ if (ret == ACT_P_CREATED) {
+ spin_lock(&mirred_list_lock);
+ list_add(&m->tcfm_list, &mirred_list);
+ spin_unlock(&mirred_list_lock);
+
+ tcf_idr_insert(tn, *a);
+ }
+
+ return ret;
+}
+
+static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_mirred *m = to_mirred(a);
+ struct sk_buff *skb2 = skb;
+ bool m_mac_header_xmit;
+ struct net_device *dev;
+ int retval, err = 0;
+ bool use_reinsert;
+ bool want_ingress;
+ bool is_redirect;
+ int m_eaction;
+ int mac_len;
+
+ tcf_lastuse_update(&m->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+
+ m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
+ m_eaction = READ_ONCE(m->tcfm_eaction);
+ retval = READ_ONCE(m->tcf_action);
+ dev = rcu_dereference_bh(m->tcfm_dev);
+ if (unlikely(!dev)) {
+ pr_notice_once("tc mirred: target device is gone\n");
+ goto out;
+ }
+
+ if (unlikely(!(dev->flags & IFF_UP))) {
+ net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
+ dev->name);
+ goto out;
+ }
+
+ /* we could easily avoid the clone only if called by ingress and clsact;
+ * since we can't easily detect the clsact caller, skip clone only for
+ * ingress - that covers the TC S/W datapath.
+ */
+ is_redirect = tcf_mirred_is_act_redirect(m_eaction);
+ use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
+ tcf_mirred_can_reinsert(retval);
+ if (!use_reinsert) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ goto out;
+ }
+
+ /* If action's target direction differs than filter's direction,
+ * and devices expect a mac header on xmit, then mac push/pull is
+ * needed.
+ */
+ want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+ if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) {
+ if (!skb_at_tc_ingress(skb)) {
+ /* caught at egress, act ingress: pull mac */
+ mac_len = skb_network_header(skb) - skb_mac_header(skb);
+ skb_pull_rcsum(skb2, mac_len);
+ } else {
+ /* caught at ingress, act egress: push mac */
+ skb_push_rcsum(skb2, skb->mac_len);
+ }
+ }
+
+ skb2->skb_iif = skb->dev->ifindex;
+ skb2->dev = dev;
+
+ /* mirror is always swallowed */
+ if (is_redirect) {
+ skb2->tc_redirected = 1;
+ skb2->tc_from_ingress = skb2->tc_at_ingress;
+
+ /* let's the caller reinsert the packet, if possible */
+ if (use_reinsert) {
+ res->ingress = want_ingress;
+ res->qstats = this_cpu_ptr(m->common.cpu_qstats);
+ return TC_ACT_REINSERT;
+ }
+ }
+
+ if (!want_ingress)
+ err = dev_queue_xmit(skb2);
+ else
+ err = netif_receive_skb(skb2);
+
+ if (err) {
+out:
+ qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
+ if (tcf_mirred_is_act_redirect(m_eaction))
+ retval = TC_ACT_SHOT;
+ }
+
+ return retval;
+}
+
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse)
+{
+ struct tcf_mirred *m = to_mirred(a);
+ struct tcf_t *tm = &m->tcf_tm;
+
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
+static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_mirred *m = to_mirred(a);
+ struct tc_mirred opt = {
+ .index = m->tcf_index,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
+ };
+ struct net_device *dev;
+ struct tcf_t t;
+
+ spin_lock_bh(&m->tcf_lock);
+ opt.action = m->tcf_action;
+ opt.eaction = m->tcfm_eaction;
+ dev = tcf_mirred_dev_dereference(m);
+ if (dev)
+ opt.ifindex = dev->ifindex;
+
+ if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &m->tcf_tm);
+ if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&m->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&m->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static int mirred_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct tcf_mirred *m;
+
+ ASSERT_RTNL();
+ if (event == NETDEV_UNREGISTER) {
+ spin_lock(&mirred_list_lock);
+ list_for_each_entry(m, &mirred_list, tcfm_list) {
+ spin_lock_bh(&m->tcf_lock);
+ if (tcf_mirred_dev_dereference(m) == dev) {
+ dev_put(dev);
+ /* Note : no rcu grace period necessary, as
+ * net_device are already rcu protected.
+ */
+ RCU_INIT_POINTER(m->tcfm_dev, NULL);
+ }
+ spin_unlock_bh(&m->tcf_lock);
+ }
+ spin_unlock(&mirred_list_lock);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mirred_device_notifier = {
+ .notifier_call = mirred_device_event,
+};
+
+static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
+{
+ struct tcf_mirred *m = to_mirred(a);
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = rcu_dereference(m->tcfm_dev);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ return dev;
+}
+
+static void tcf_mirred_put_dev(struct net_device *dev)
+{
+ dev_put(dev);
+}
+
+static struct tc_action_ops act_mirred_ops = {
+ .kind = "mirred",
+ .type = TCA_ACT_MIRRED,
+ .owner = THIS_MODULE,
+ .act = tcf_mirred_act,
+ .stats_update = tcf_stats_update,
+ .dump = tcf_mirred_dump,
+ .cleanup = tcf_mirred_release,
+ .init = tcf_mirred_init,
+ .walk = tcf_mirred_walker,
+ .lookup = tcf_mirred_search,
+ .size = sizeof(struct tcf_mirred),
+ .get_dev = tcf_mirred_get_dev,
+ .put_dev = tcf_mirred_put_dev,
+};
+
+static __net_init int mirred_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ return tc_action_net_init(net, tn, &act_mirred_ops);
+}
+
+static void __net_exit mirred_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, mirred_net_id);
+}
+
+static struct pernet_operations mirred_net_ops = {
+ .init = mirred_init_net,
+ .exit_batch = mirred_exit_net,
+ .id = &mirred_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002)");
+MODULE_DESCRIPTION("Device Mirror/redirect actions");
+MODULE_LICENSE("GPL");
+
+static int __init mirred_init_module(void)
+{
+ int err = register_netdevice_notifier(&mirred_device_notifier);
+ if (err)
+ return err;
+
+ pr_info("Mirror/redirect action on\n");
+ err = tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ if (err)
+ unregister_netdevice_notifier(&mirred_device_notifier);
+
+ return err;
+}
+
+static void __exit mirred_cleanup_module(void)
+{
+ tcf_unregister_action(&act_mirred_ops, &mirred_net_ops);
+ unregister_netdevice_notifier(&mirred_device_notifier);
+}
+
+module_init(mirred_init_module);
+module_exit(mirred_cleanup_module);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
new file mode 100644
index 000000000..d1b47a1b1
--- /dev/null
+++ b/net/sched/act_nat.c
@@ -0,0 +1,349 @@
+/*
+ * Stateless NAT actions
+ *
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tc_act/tc_nat.h>
+#include <net/act_api.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/tc_act/tc_nat.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+
+static unsigned int nat_net_id;
+static struct tc_action_ops act_nat_ops;
+
+static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
+ [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
+};
+
+static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
+ struct tc_action **a, int ovr, int bind,
+ bool rtnl_held, struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+ struct nlattr *tb[TCA_NAT_MAX + 1];
+ struct tc_nat *parm;
+ int ret = 0, err;
+ struct tcf_nat *p;
+ u32 index;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_NAT_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_NAT_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_nat_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind)
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+ p = to_tcf_nat(*a);
+
+ spin_lock_bh(&p->tcf_lock);
+ p->old_addr = parm->old_addr;
+ p->new_addr = parm->new_addr;
+ p->mask = parm->mask;
+ p->flags = parm->flags;
+
+ p->tcf_action = parm->action;
+ spin_unlock_bh(&p->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+}
+
+static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_nat *p = to_tcf_nat(a);
+ struct iphdr *iph;
+ __be32 old_addr;
+ __be32 new_addr;
+ __be32 mask;
+ __be32 addr;
+ int egress;
+ int action;
+ int ihl;
+ int noff;
+
+ spin_lock(&p->tcf_lock);
+
+ tcf_lastuse_update(&p->tcf_tm);
+ old_addr = p->old_addr;
+ new_addr = p->new_addr;
+ mask = p->mask;
+ egress = p->flags & TCA_NAT_FLAG_EGRESS;
+ action = p->tcf_action;
+
+ bstats_update(&p->tcf_bstats, skb);
+
+ spin_unlock(&p->tcf_lock);
+
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
+
+ noff = skb_network_offset(skb);
+ if (!pskb_may_pull(skb, sizeof(*iph) + noff))
+ goto drop;
+
+ iph = ip_hdr(skb);
+
+ if (egress)
+ addr = iph->saddr;
+ else
+ addr = iph->daddr;
+
+ if (!((old_addr ^ addr) & mask)) {
+ if (skb_try_make_writable(skb, sizeof(*iph) + noff))
+ goto drop;
+
+ new_addr &= mask;
+ new_addr |= addr & ~mask;
+
+ /* Rewrite IP header */
+ iph = ip_hdr(skb);
+ if (egress)
+ iph->saddr = new_addr;
+ else
+ iph->daddr = new_addr;
+
+ csum_replace4(&iph->check, addr, new_addr);
+ } else if ((iph->frag_off & htons(IP_OFFSET)) ||
+ iph->protocol != IPPROTO_ICMP) {
+ goto out;
+ }
+
+ ihl = iph->ihl * 4;
+
+ /* It would be nice to share code with stateful NAT. */
+ switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
+ case IPPROTO_TCP:
+ {
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
+ skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff))
+ goto drop;
+
+ tcph = (void *)(skb_network_header(skb) + ihl);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr,
+ true);
+ break;
+ }
+ case IPPROTO_UDP:
+ {
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
+ skb_try_make_writable(skb, ihl + sizeof(*udph) + noff))
+ goto drop;
+
+ udph = (void *)(skb_network_header(skb) + ihl);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&udph->check, skb, addr,
+ new_addr, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+ break;
+ }
+ case IPPROTO_ICMP:
+ {
+ struct icmphdr *icmph;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff))
+ goto drop;
+
+ icmph = (void *)(skb_network_header(skb) + ihl);
+
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED) &&
+ (icmph->type != ICMP_PARAMETERPROB))
+ break;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
+ noff))
+ goto drop;
+
+ icmph = (void *)(skb_network_header(skb) + ihl);
+ iph = (void *)(icmph + 1);
+ if (egress)
+ addr = iph->daddr;
+ else
+ addr = iph->saddr;
+
+ if ((old_addr ^ addr) & mask)
+ break;
+
+ if (skb_try_make_writable(skb, ihl + sizeof(*icmph) +
+ sizeof(*iph) + noff))
+ goto drop;
+
+ icmph = (void *)(skb_network_header(skb) + ihl);
+ iph = (void *)(icmph + 1);
+
+ new_addr &= mask;
+ new_addr |= addr & ~mask;
+
+ /* XXX Fix up the inner checksums. */
+ if (egress)
+ iph->daddr = new_addr;
+ else
+ iph->saddr = new_addr;
+
+ inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
+ false);
+ break;
+ }
+ default:
+ break;
+ }
+
+out:
+ return action;
+
+drop:
+ spin_lock(&p->tcf_lock);
+ p->tcf_qstats.drops++;
+ spin_unlock(&p->tcf_lock);
+ return TC_ACT_SHOT;
+}
+
+static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_nat *p = to_tcf_nat(a);
+ struct tc_nat opt = {
+ .old_addr = p->old_addr,
+ .new_addr = p->new_addr,
+ .mask = p->mask,
+ .flags = p->flags,
+
+ .index = p->tcf_index,
+ .action = p->tcf_action,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &p->tcf_tm);
+ if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_nat_ops = {
+ .kind = "nat",
+ .type = TCA_ACT_NAT,
+ .owner = THIS_MODULE,
+ .act = tcf_nat_act,
+ .dump = tcf_nat_dump,
+ .init = tcf_nat_init,
+ .walk = tcf_nat_walker,
+ .lookup = tcf_nat_search,
+ .size = sizeof(struct tcf_nat),
+};
+
+static __net_init int nat_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ return tc_action_net_init(net, tn, &act_nat_ops);
+}
+
+static void __net_exit nat_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, nat_net_id);
+}
+
+static struct pernet_operations nat_net_ops = {
+ .init = nat_init_net,
+ .exit_batch = nat_exit_net,
+ .id = &nat_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_DESCRIPTION("Stateless NAT actions");
+MODULE_LICENSE("GPL");
+
+static int __init nat_init_module(void)
+{
+ return tcf_register_action(&act_nat_ops, &nat_net_ops);
+}
+
+static void __exit nat_cleanup_module(void)
+{
+ tcf_unregister_action(&act_nat_ops, &nat_net_ops);
+}
+
+module_init(nat_init_module);
+module_exit(nat_cleanup_module);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
new file mode 100644
index 000000000..aeb8f84cb
--- /dev/null
+++ b/net/sched/act_pedit.c
@@ -0,0 +1,541 @@
+/*
+ * net/sched/act_pedit.c Generic packet editor
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim (2002-4)
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_pedit.h>
+#include <net/tc_act/tc_pedit.h>
+#include <uapi/linux/tc_act/tc_pedit.h>
+
+static unsigned int pedit_net_id;
+static struct tc_action_ops act_pedit_ops;
+
+static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
+ [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
+ [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
+ [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 },
+ [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 },
+};
+
+static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
+ u8 n)
+{
+ struct tcf_pedit_key_ex *keys_ex;
+ struct tcf_pedit_key_ex *k;
+ const struct nlattr *ka;
+ int err = -EINVAL;
+ int rem;
+
+ if (!nla)
+ return NULL;
+
+ keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
+ if (!keys_ex)
+ return ERR_PTR(-ENOMEM);
+
+ k = keys_ex;
+
+ nla_for_each_nested(ka, nla, rem) {
+ struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1];
+
+ if (!n) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ n--;
+
+ if (nla_type(ka) != TCA_PEDIT_KEY_EX) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
+ pedit_key_ex_policy, NULL);
+ if (err)
+ goto err_out;
+
+ if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
+ !tb[TCA_PEDIT_KEY_EX_CMD]) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+ k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
+
+ if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
+ k->cmd > TCA_PEDIT_CMD_MAX) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ k++;
+ }
+
+ if (n) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ return keys_ex;
+
+err_out:
+ kfree(keys_ex);
+ return ERR_PTR(err);
+}
+
+static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
+ struct tcf_pedit_key_ex *keys_ex, int n)
+{
+ struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
+
+ if (!keys_start)
+ goto nla_failure;
+ for (; n > 0; n--) {
+ struct nlattr *key_start;
+
+ key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
+ if (!key_start)
+ goto nla_failure;
+
+ if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
+ nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd))
+ goto nla_failure;
+
+ nla_nest_end(skb, key_start);
+
+ keys_ex++;
+ }
+
+ nla_nest_end(skb, keys_start);
+
+ return 0;
+nla_failure:
+ nla_nest_cancel(skb, keys_start);
+ return -EINVAL;
+}
+
+static int tcf_pedit_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+ struct nlattr *tb[TCA_PEDIT_MAX + 1];
+ struct tc_pedit_key *keys = NULL;
+ struct tcf_pedit_key_ex *keys_ex;
+ struct tc_pedit *parm;
+ struct nlattr *pattr;
+ struct tcf_pedit *p;
+ int ret = 0, err;
+ int i, ksize;
+ u32 index;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
+ if (err < 0)
+ return err;
+
+ pattr = tb[TCA_PEDIT_PARMS];
+ if (!pattr)
+ pattr = tb[TCA_PEDIT_PARMS_EX];
+ if (!pattr) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute");
+ return -EINVAL;
+ }
+
+ parm = nla_data(pattr);
+ if (!parm->nkeys) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ return -EINVAL;
+ }
+ ksize = parm->nkeys * sizeof(struct tc_pedit_key);
+ if (nla_len(pattr) < sizeof(*parm) + ksize) {
+ NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
+ return -EINVAL;
+ }
+
+ keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
+ if (IS_ERR(keys_ex))
+ return PTR_ERR(keys_ex);
+
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_pedit_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ goto out_free;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind)
+ goto out_free;
+ if (!ovr) {
+ ret = -EEXIST;
+ goto out_release;
+ }
+ } else {
+ ret = err;
+ goto out_free;
+ }
+
+ p = to_pedit(*a);
+ spin_lock_bh(&p->tcf_lock);
+
+ if (ret == ACT_P_CREATED ||
+ (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) {
+ keys = kmalloc(ksize, GFP_ATOMIC);
+ if (!keys) {
+ spin_unlock_bh(&p->tcf_lock);
+ ret = -ENOMEM;
+ goto out_release;
+ }
+ kfree(p->tcfp_keys);
+ p->tcfp_keys = keys;
+ p->tcfp_nkeys = parm->nkeys;
+ }
+ memcpy(p->tcfp_keys, parm->keys, ksize);
+ p->tcfp_off_max_hint = 0;
+ for (i = 0; i < p->tcfp_nkeys; ++i) {
+ u32 cur = p->tcfp_keys[i].off;
+
+ /* sanitize the shift value for any later use */
+ p->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1,
+ p->tcfp_keys[i].shift);
+
+ /* The AT option can read a single byte, we can bound the actual
+ * value with uchar max.
+ */
+ cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift;
+
+ /* Each key touches 4 bytes starting from the computed offset */
+ p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4);
+ }
+
+ p->tcfp_flags = parm->flags;
+ p->tcf_action = parm->action;
+
+ kfree(p->tcfp_keys_ex);
+ p->tcfp_keys_ex = keys_ex;
+
+ spin_unlock_bh(&p->tcf_lock);
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+
+out_release:
+ tcf_idr_release(*a, bind);
+out_free:
+ kfree(keys_ex);
+ return ret;
+
+}
+
+static void tcf_pedit_cleanup(struct tc_action *a)
+{
+ struct tcf_pedit *p = to_pedit(a);
+ struct tc_pedit_key *keys = p->tcfp_keys;
+
+ kfree(keys);
+ kfree(p->tcfp_keys_ex);
+}
+
+static bool offset_valid(struct sk_buff *skb, int offset)
+{
+ if (offset > 0 && offset > skb->len)
+ return false;
+
+ if (offset < 0 && -offset > skb_headroom(skb))
+ return false;
+
+ return true;
+}
+
+static int pedit_skb_hdr_offset(struct sk_buff *skb,
+ enum pedit_header_type htype, int *hoffset)
+{
+ int ret = -EINVAL;
+
+ switch (htype) {
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
+ if (skb_mac_header_was_set(skb)) {
+ *hoffset = skb_mac_offset(skb);
+ ret = 0;
+ }
+ break;
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK:
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
+ *hoffset = skb_network_offset(skb);
+ ret = 0;
+ break;
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
+ if (skb_transport_header_was_set(skb)) {
+ *hoffset = skb_transport_offset(skb);
+ ret = 0;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_pedit *p = to_pedit(a);
+ u32 max_offset;
+ int i;
+
+ spin_lock(&p->tcf_lock);
+
+ max_offset = (skb_transport_header_was_set(skb) ?
+ skb_transport_offset(skb) :
+ skb_network_offset(skb)) +
+ p->tcfp_off_max_hint;
+ if (skb_ensure_writable(skb, min(skb->len, max_offset)))
+ goto unlock;
+
+ tcf_lastuse_update(&p->tcf_tm);
+
+ if (p->tcfp_nkeys > 0) {
+ struct tc_pedit_key *tkey = p->tcfp_keys;
+ struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
+ enum pedit_header_type htype =
+ TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+ enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
+
+ for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
+ u32 *ptr, hdata;
+ int offset = tkey->off;
+ int hoffset;
+ u32 val;
+ int rc;
+
+ if (tkey_ex) {
+ htype = tkey_ex->htype;
+ cmd = tkey_ex->cmd;
+
+ tkey_ex++;
+ }
+
+ rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
+ if (rc) {
+ pr_info("tc action pedit bad header type specified (0x%x)\n",
+ htype);
+ goto bad;
+ }
+
+ if (tkey->offmask) {
+ u8 *d, _d;
+
+ if (!offset_valid(skb, hoffset + tkey->at)) {
+ pr_info("tc action pedit 'at' offset %d out of bounds\n",
+ hoffset + tkey->at);
+ goto bad;
+ }
+ d = skb_header_pointer(skb, hoffset + tkey->at,
+ sizeof(_d), &_d);
+ if (!d)
+ goto bad;
+ offset += (*d & tkey->offmask) >> tkey->shift;
+ }
+
+ if (offset % 4) {
+ pr_info("tc action pedit offset must be on 32 bit boundaries\n");
+ goto bad;
+ }
+
+ if (!offset_valid(skb, hoffset + offset)) {
+ pr_info("tc action pedit offset %d out of bounds\n",
+ hoffset + offset);
+ goto bad;
+ }
+
+ ptr = skb_header_pointer(skb, hoffset + offset,
+ sizeof(hdata), &hdata);
+ if (!ptr)
+ goto bad;
+ /* just do it, baby */
+ switch (cmd) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ val = tkey->val;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ val = (*ptr + tkey->val) & ~tkey->mask;
+ break;
+ default:
+ pr_info("tc action pedit bad command (%d)\n",
+ cmd);
+ goto bad;
+ }
+
+ *ptr = ((*ptr & tkey->mask) ^ val);
+ if (ptr == &hdata)
+ skb_store_bits(skb, hoffset + offset, ptr, 4);
+ }
+
+ goto done;
+ } else {
+ WARN(1, "pedit BUG: index %d\n", p->tcf_index);
+ }
+
+bad:
+ p->tcf_qstats.overlimits++;
+done:
+ bstats_update(&p->tcf_bstats, skb);
+unlock:
+ spin_unlock(&p->tcf_lock);
+ return p->tcf_action;
+}
+
+static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_pedit *p = to_pedit(a);
+ struct tc_pedit *opt;
+ struct tcf_t t;
+ int s;
+
+ s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
+
+ /* netlink spinlocks held above us - must use ATOMIC */
+ opt = kzalloc(s, GFP_ATOMIC);
+ if (unlikely(!opt))
+ return -ENOBUFS;
+
+ spin_lock_bh(&p->tcf_lock);
+ memcpy(opt->keys, p->tcfp_keys,
+ p->tcfp_nkeys * sizeof(struct tc_pedit_key));
+ opt->index = p->tcf_index;
+ opt->nkeys = p->tcfp_nkeys;
+ opt->flags = p->tcfp_flags;
+ opt->action = p->tcf_action;
+ opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
+ opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
+
+ if (p->tcfp_keys_ex) {
+ if (tcf_pedit_key_ex_dump(skb,
+ p->tcfp_keys_ex,
+ p->tcfp_nkeys))
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
+ goto nla_put_failure;
+ } else {
+ if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
+ goto nla_put_failure;
+ }
+
+ tcf_tm_dump(&t, &p->tcf_tm);
+ if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&p->tcf_lock);
+
+ kfree(opt);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&p->tcf_lock);
+ nlmsg_trim(skb, b);
+ kfree(opt);
+ return -1;
+}
+
+static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_pedit_ops = {
+ .kind = "pedit",
+ .type = TCA_ACT_PEDIT,
+ .owner = THIS_MODULE,
+ .act = tcf_pedit_act,
+ .dump = tcf_pedit_dump,
+ .cleanup = tcf_pedit_cleanup,
+ .init = tcf_pedit_init,
+ .walk = tcf_pedit_walker,
+ .lookup = tcf_pedit_search,
+ .size = sizeof(struct tcf_pedit),
+};
+
+static __net_init int pedit_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ return tc_action_net_init(net, tn, &act_pedit_ops);
+}
+
+static void __net_exit pedit_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, pedit_net_id);
+}
+
+static struct pernet_operations pedit_net_ops = {
+ .init = pedit_init_net,
+ .exit_batch = pedit_exit_net,
+ .id = &pedit_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_DESCRIPTION("Generic Packet Editor actions");
+MODULE_LICENSE("GPL");
+
+static int __init pedit_init_module(void)
+{
+ return tcf_register_action(&act_pedit_ops, &pedit_net_ops);
+}
+
+static void __exit pedit_cleanup_module(void)
+{
+ tcf_unregister_action(&act_pedit_ops, &pedit_net_ops);
+}
+
+module_init(pedit_init_module);
+module_exit(pedit_cleanup_module);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
new file mode 100644
index 000000000..4db25959e
--- /dev/null
+++ b/net/sched/act_police.c
@@ -0,0 +1,371 @@
+/*
+ * net/sched/act_police.c Input police filter
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * J Hadi Salim (action changes)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+
+struct tcf_police {
+ struct tc_action common;
+ int tcfp_result;
+ u32 tcfp_ewma_rate;
+ s64 tcfp_burst;
+ u32 tcfp_mtu;
+ s64 tcfp_toks;
+ s64 tcfp_ptoks;
+ s64 tcfp_mtu_ptoks;
+ s64 tcfp_t_c;
+ struct psched_ratecfg rate;
+ bool rate_present;
+ struct psched_ratecfg peak;
+ bool peak_present;
+};
+
+#define to_police(pc) ((struct tcf_police *)pc)
+
+/* old policer structure from before tc actions */
+struct tc_police_compat {
+ u32 index;
+ int action;
+ u32 limit;
+ u32 burst;
+ u32 mtu;
+ struct tc_ratespec rate;
+ struct tc_ratespec peakrate;
+};
+
+/* Each policer is serialized by its individual spinlock */
+
+static unsigned int police_net_id;
+static struct tc_action_ops act_police_ops;
+
+static int tcf_police_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
+ [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE },
+ [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
+ [TCA_POLICE_AVRATE] = { .type = NLA_U32 },
+ [TCA_POLICE_RESULT] = { .type = NLA_U32 },
+};
+
+static int tcf_police_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ int ret = 0, err;
+ struct nlattr *tb[TCA_POLICE_MAX + 1];
+ struct tc_police *parm;
+ struct tcf_police *police;
+ struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+ bool exists = false;
+ u32 index;
+ int size;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_POLICE_TBF] == NULL)
+ return -EINVAL;
+ size = nla_len(tb[TCA_POLICE_TBF]);
+ if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_POLICE_TBF]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, NULL, a,
+ &act_police_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ police = to_police(*a);
+ if (parm->rate.rate) {
+ err = -ENOMEM;
+ R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE], NULL);
+ if (R_tab == NULL)
+ goto failure;
+
+ if (parm->peakrate.rate) {
+ P_tab = qdisc_get_rtab(&parm->peakrate,
+ tb[TCA_POLICE_PEAKRATE], NULL);
+ if (P_tab == NULL)
+ goto failure;
+ }
+ }
+
+ if (est) {
+ err = gen_replace_estimator(&police->tcf_bstats, NULL,
+ &police->tcf_rate_est,
+ &police->tcf_lock,
+ NULL, est);
+ if (err)
+ goto failure;
+ } else if (tb[TCA_POLICE_AVRATE] &&
+ (ret == ACT_P_CREATED ||
+ !gen_estimator_active(&police->tcf_rate_est))) {
+ err = -EINVAL;
+ goto failure;
+ }
+
+ spin_lock_bh(&police->tcf_lock);
+ /* No failure allowed after this point */
+ police->tcfp_mtu = parm->mtu;
+ if (police->tcfp_mtu == 0) {
+ police->tcfp_mtu = ~0;
+ if (R_tab)
+ police->tcfp_mtu = 255 << R_tab->rate.cell_log;
+ }
+ if (R_tab) {
+ police->rate_present = true;
+ psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
+ qdisc_put_rtab(R_tab);
+ } else {
+ police->rate_present = false;
+ }
+ if (P_tab) {
+ police->peak_present = true;
+ psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
+ qdisc_put_rtab(P_tab);
+ } else {
+ police->peak_present = false;
+ }
+
+ if (tb[TCA_POLICE_RESULT])
+ police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
+ police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
+ police->tcfp_toks = police->tcfp_burst;
+ if (police->peak_present) {
+ police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
+ police->tcfp_mtu);
+ police->tcfp_ptoks = police->tcfp_mtu_ptoks;
+ }
+ police->tcf_action = parm->action;
+
+ if (tb[TCA_POLICE_AVRATE])
+ police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+
+ spin_unlock_bh(&police->tcf_lock);
+ if (ret != ACT_P_CREATED)
+ return ret;
+
+ police->tcfp_t_c = ktime_get_ns();
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+
+failure:
+ qdisc_put_rtab(P_tab);
+ qdisc_put_rtab(R_tab);
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_police *police = to_police(a);
+ s64 now;
+ s64 toks;
+ s64 ptoks = 0;
+
+ spin_lock(&police->tcf_lock);
+
+ bstats_update(&police->tcf_bstats, skb);
+ tcf_lastuse_update(&police->tcf_tm);
+
+ if (police->tcfp_ewma_rate) {
+ struct gnet_stats_rate_est64 sample;
+
+ if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+ sample.bps >= police->tcfp_ewma_rate) {
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+ }
+ }
+
+ if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
+ if (!police->rate_present) {
+ spin_unlock(&police->tcf_lock);
+ return police->tcfp_result;
+ }
+
+ now = ktime_get_ns();
+ toks = min_t(s64, now - police->tcfp_t_c,
+ police->tcfp_burst);
+ if (police->peak_present) {
+ ptoks = toks + police->tcfp_ptoks;
+ if (ptoks > police->tcfp_mtu_ptoks)
+ ptoks = police->tcfp_mtu_ptoks;
+ ptoks -= (s64) psched_l2t_ns(&police->peak,
+ qdisc_pkt_len(skb));
+ }
+ toks += police->tcfp_toks;
+ if (toks > police->tcfp_burst)
+ toks = police->tcfp_burst;
+ toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
+ if ((toks|ptoks) >= 0) {
+ police->tcfp_t_c = now;
+ police->tcfp_toks = toks;
+ police->tcfp_ptoks = ptoks;
+ if (police->tcfp_result == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcfp_result;
+ }
+ }
+
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+}
+
+static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_police *police = to_police(a);
+ struct tc_police opt = {
+ .index = police->tcf_index,
+ .refcnt = refcount_read(&police->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&police->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&police->tcf_lock);
+ opt.action = police->tcf_action;
+ opt.mtu = police->tcfp_mtu;
+ opt.burst = PSCHED_NS2TICKS(police->tcfp_burst);
+ if (police->rate_present)
+ psched_ratecfg_getrate(&opt.rate, &police->rate);
+ if (police->peak_present)
+ psched_ratecfg_getrate(&opt.peakrate, &police->peak);
+ if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if (police->tcfp_result &&
+ nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
+ goto nla_put_failure;
+ if (police->tcfp_ewma_rate &&
+ nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
+ goto nla_put_failure;
+
+ t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
+ t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse);
+ t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+ if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&police->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&police->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_police_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+MODULE_AUTHOR("Alexey Kuznetsov");
+MODULE_DESCRIPTION("Policing actions");
+MODULE_LICENSE("GPL");
+
+static struct tc_action_ops act_police_ops = {
+ .kind = "police",
+ .type = TCA_ID_POLICE,
+ .owner = THIS_MODULE,
+ .act = tcf_police_act,
+ .dump = tcf_police_dump,
+ .init = tcf_police_init,
+ .walk = tcf_police_walker,
+ .lookup = tcf_police_search,
+ .size = sizeof(struct tcf_police),
+};
+
+static __net_init int police_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+
+ return tc_action_net_init(net, tn, &act_police_ops);
+}
+
+static void __net_exit police_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, police_net_id);
+}
+
+static struct pernet_operations police_net_ops = {
+ .init = police_init_net,
+ .exit_batch = police_exit_net,
+ .id = &police_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init police_init_module(void)
+{
+ return tcf_register_action(&act_police_ops, &police_net_ops);
+}
+
+static void __exit police_cleanup_module(void)
+{
+ tcf_unregister_action(&act_police_ops, &police_net_ops);
+}
+
+module_init(police_init_module);
+module_exit(police_cleanup_module);
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
new file mode 100644
index 000000000..ea0738ceb
--- /dev/null
+++ b/net/sched/act_sample.c
@@ -0,0 +1,292 @@
+/*
+ * net/sched/act_sample.c - Packet sampling tc action
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_sample.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/psample.h>
+
+#include <linux/if_arp.h>
+
+static unsigned int sample_net_id;
+static struct tc_action_ops act_sample_ops;
+
+static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
+ [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) },
+ [TCA_SAMPLE_RATE] = { .type = NLA_U32 },
+ [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 },
+ [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 },
+};
+
+static int tcf_sample_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a, int ovr,
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+ struct nlattr *tb[TCA_SAMPLE_MAX + 1];
+ struct psample_group *psample_group;
+ u32 psample_group_num, rate, index;
+ struct tc_sample *parm;
+ struct tcf_sample *s;
+ bool exists = false;
+ int ret, err;
+
+ if (!nla)
+ return -EINVAL;
+ ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL);
+ if (ret < 0)
+ return ret;
+ if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
+ !tb[TCA_SAMPLE_PSAMPLE_GROUP])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_SAMPLE_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_sample_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ if (!rate) {
+ NL_SET_ERR_MSG(extack, "invalid sample rate");
+ tcf_idr_release(*a, bind);
+ return -EINVAL;
+ }
+ psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
+ psample_group = psample_group_get(net, psample_group_num);
+ if (!psample_group) {
+ tcf_idr_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ s = to_sample(*a);
+
+ spin_lock_bh(&s->tcf_lock);
+ s->tcf_action = parm->action;
+ s->rate = rate;
+ s->psample_group_num = psample_group_num;
+ rcu_swap_protected(s->psample_group, psample_group,
+ lockdep_is_held(&s->tcf_lock));
+
+ if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
+ s->truncate = true;
+ s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
+ }
+ spin_unlock_bh(&s->tcf_lock);
+
+ if (psample_group)
+ psample_group_put(psample_group);
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+}
+
+static void tcf_sample_cleanup(struct tc_action *a)
+{
+ struct tcf_sample *s = to_sample(a);
+ struct psample_group *psample_group;
+
+ /* last reference to action, no need to lock */
+ psample_group = rcu_dereference_protected(s->psample_group, 1);
+ RCU_INIT_POINTER(s->psample_group, NULL);
+ if (psample_group)
+ psample_group_put(psample_group);
+}
+
+static bool tcf_sample_dev_ok_push(struct net_device *dev)
+{
+ switch (dev->type) {
+ case ARPHRD_TUNNEL:
+ case ARPHRD_TUNNEL6:
+ case ARPHRD_SIT:
+ case ARPHRD_IPGRE:
+ case ARPHRD_IP6GRE:
+ case ARPHRD_VOID:
+ case ARPHRD_NONE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_sample *s = to_sample(a);
+ struct psample_group *psample_group;
+ int retval;
+ int size;
+ int iif;
+ int oif;
+
+ tcf_lastuse_update(&s->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
+ retval = READ_ONCE(s->tcf_action);
+
+ psample_group = rcu_dereference_bh(s->psample_group);
+
+ /* randomly sample packets according to rate */
+ if (psample_group && (prandom_u32() % s->rate == 0)) {
+ if (!skb_at_tc_ingress(skb)) {
+ iif = skb->skb_iif;
+ oif = skb->dev->ifindex;
+ } else {
+ iif = skb->dev->ifindex;
+ oif = 0;
+ }
+
+ /* on ingress, the mac header gets popped, so push it back */
+ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+ skb_push(skb, skb->mac_len);
+
+ size = s->truncate ? s->trunc_size : skb->len;
+ psample_sample_packet(psample_group, skb, size, iif, oif,
+ s->rate);
+
+ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+ skb_pull(skb, skb->mac_len);
+ }
+
+ return retval;
+}
+
+static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_sample *s = to_sample(a);
+ struct tc_sample opt = {
+ .index = s->tcf_index,
+ .refcnt = refcount_read(&s->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&s->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&s->tcf_lock);
+ opt.action = s->tcf_action;
+ if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &s->tcf_tm);
+ if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
+ goto nla_put_failure;
+
+ if (s->truncate)
+ if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
+ goto nla_put_failure;
+ spin_unlock_bh(&s->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&s->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_sample_ops = {
+ .kind = "sample",
+ .type = TCA_ACT_SAMPLE,
+ .owner = THIS_MODULE,
+ .act = tcf_sample_act,
+ .dump = tcf_sample_dump,
+ .init = tcf_sample_init,
+ .cleanup = tcf_sample_cleanup,
+ .walk = tcf_sample_walker,
+ .lookup = tcf_sample_search,
+ .size = sizeof(struct tcf_sample),
+};
+
+static __net_init int sample_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ return tc_action_net_init(net, tn, &act_sample_ops);
+}
+
+static void __net_exit sample_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, sample_net_id);
+}
+
+static struct pernet_operations sample_net_ops = {
+ .init = sample_init_net,
+ .exit_batch = sample_exit_net,
+ .id = &sample_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init sample_init_module(void)
+{
+ return tcf_register_action(&act_sample_ops, &sample_net_ops);
+}
+
+static void __exit sample_cleanup_module(void)
+{
+ tcf_unregister_action(&act_sample_ops, &sample_net_ops);
+}
+
+module_init(sample_init_module);
+module_exit(sample_cleanup_module);
+
+MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>");
+MODULE_DESCRIPTION("Packet sampling action");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
new file mode 100644
index 000000000..b418ef62e
--- /dev/null
+++ b/net/sched/act_simple.c
@@ -0,0 +1,251 @@
+/*
+ * net/sched/act_simple.c Simple example of an action
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim (2005-8)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#define TCA_ACT_SIMP 22
+
+#include <linux/tc_act/tc_defact.h>
+#include <net/tc_act/tc_defact.h>
+
+static unsigned int simp_net_id;
+static struct tc_action_ops act_simp_ops;
+
+#define SIMP_MAX_DATA 32
+static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_defact *d = to_defact(a);
+
+ spin_lock(&d->tcf_lock);
+ tcf_lastuse_update(&d->tcf_tm);
+ bstats_update(&d->tcf_bstats, skb);
+
+ /* print policy string followed by _ then packet count
+ * Example if this was the 3rd packet and the string was "hello"
+ * then it would look like "hello_3" (without quotes)
+ */
+ pr_info("simple: %s_%d\n",
+ (char *)d->tcfd_defdata, d->tcf_bstats.packets);
+ spin_unlock(&d->tcf_lock);
+ return d->tcf_action;
+}
+
+static void tcf_simp_release(struct tc_action *a)
+{
+ struct tcf_defact *d = to_defact(a);
+ kfree(d->tcfd_defdata);
+}
+
+static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata)
+{
+ d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
+ if (unlikely(!d->tcfd_defdata))
+ return -ENOMEM;
+ nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ return 0;
+}
+
+static void reset_policy(struct tcf_defact *d, const struct nlattr *defdata,
+ struct tc_defact *p)
+{
+ spin_lock_bh(&d->tcf_lock);
+ d->tcf_action = p->action;
+ memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
+ nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ spin_unlock_bh(&d->tcf_lock);
+}
+
+static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
+ [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) },
+ [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA },
+};
+
+static int tcf_simp_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+ struct nlattr *tb[TCA_DEF_MAX + 1];
+ struct tc_defact *parm;
+ struct tcf_defact *d;
+ bool exists = false;
+ int ret = 0, err;
+ u32 index;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_DEF_PARMS] == NULL)
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_DEF_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (tb[TCA_DEF_DATA] == NULL) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_simp_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ d = to_defact(*a);
+ ret = alloc_defdata(d, tb[TCA_DEF_DATA]);
+ if (ret < 0) {
+ tcf_idr_release(*a, bind);
+ return ret;
+ }
+ d->tcf_action = parm->action;
+ ret = ACT_P_CREATED;
+ } else {
+ d = to_defact(*a);
+
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ reset_policy(d, tb[TCA_DEF_DATA], parm);
+ }
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+}
+
+static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_defact *d = to_defact(a);
+ struct tc_defact opt = {
+ .index = d->tcf_index,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&d->tcf_lock);
+ opt.action = d->tcf_action;
+ if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
+ nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &d->tcf_tm);
+ if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&d->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&d->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_simp_ops = {
+ .kind = "simple",
+ .type = TCA_ACT_SIMP,
+ .owner = THIS_MODULE,
+ .act = tcf_simp_act,
+ .dump = tcf_simp_dump,
+ .cleanup = tcf_simp_release,
+ .init = tcf_simp_init,
+ .walk = tcf_simp_walker,
+ .lookup = tcf_simp_search,
+ .size = sizeof(struct tcf_defact),
+};
+
+static __net_init int simp_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ return tc_action_net_init(net, tn, &act_simp_ops);
+}
+
+static void __net_exit simp_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, simp_net_id);
+}
+
+static struct pernet_operations simp_net_ops = {
+ .init = simp_init_net,
+ .exit_batch = simp_exit_net,
+ .id = &simp_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2005)");
+MODULE_DESCRIPTION("Simple example action");
+MODULE_LICENSE("GPL");
+
+static int __init simp_init_module(void)
+{
+ int ret = tcf_register_action(&act_simp_ops, &simp_net_ops);
+ if (!ret)
+ pr_info("Simple TC action Loaded\n");
+ return ret;
+}
+
+static void __exit simp_cleanup_module(void)
+{
+ tcf_unregister_action(&act_simp_ops, &simp_net_ops);
+}
+
+module_init(simp_init_module);
+module_exit(simp_cleanup_module);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
new file mode 100644
index 000000000..06648b799
--- /dev/null
+++ b/net/sched/act_skbedit.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
+
+#include <linux/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_skbedit.h>
+
+static unsigned int skbedit_net_id;
+static struct tc_action_ops act_skbedit_ops;
+
+static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
+ int action;
+
+ tcf_lastuse_update(&d->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+
+ params = rcu_dereference_bh(d->params);
+ action = READ_ONCE(d->tcf_action);
+
+ if (params->flags & SKBEDIT_F_PRIORITY)
+ skb->priority = params->priority;
+ if (params->flags & SKBEDIT_F_INHERITDSFIELD) {
+ int wlen = skb_network_offset(skb);
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto err;
+ skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+ break;
+
+ case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto err;
+ skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+ break;
+ }
+ }
+ if (params->flags & SKBEDIT_F_QUEUE_MAPPING &&
+ skb->dev->real_num_tx_queues > params->queue_mapping)
+ skb_set_queue_mapping(skb, params->queue_mapping);
+ if (params->flags & SKBEDIT_F_MARK) {
+ skb->mark &= ~params->mask;
+ skb->mark |= params->mark & params->mask;
+ }
+ if (params->flags & SKBEDIT_F_PTYPE)
+ skb->pkt_type = params->ptype;
+ return action;
+
+err:
+ qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
+ [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
+ [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
+ [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
+ [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) },
+};
+
+static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+ struct tcf_skbedit_params *params_old, *params_new;
+ struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
+ struct tc_skbedit *parm;
+ struct tcf_skbedit *d;
+ u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
+ u16 *queue_mapping = NULL, *ptype = NULL;
+ bool exists = false;
+ int ret = 0, err;
+ u32 index;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_SKBEDIT_PARMS] == NULL)
+ return -EINVAL;
+
+ if (tb[TCA_SKBEDIT_PRIORITY] != NULL) {
+ flags |= SKBEDIT_F_PRIORITY;
+ priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]);
+ }
+
+ if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
+ flags |= SKBEDIT_F_QUEUE_MAPPING;
+ queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
+ }
+
+ if (tb[TCA_SKBEDIT_PTYPE] != NULL) {
+ ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]);
+ if (!skb_pkt_type_ok(*ptype))
+ return -EINVAL;
+ flags |= SKBEDIT_F_PTYPE;
+ }
+
+ if (tb[TCA_SKBEDIT_MARK] != NULL) {
+ flags |= SKBEDIT_F_MARK;
+ mark = nla_data(tb[TCA_SKBEDIT_MARK]);
+ }
+
+ if (tb[TCA_SKBEDIT_MASK] != NULL) {
+ flags |= SKBEDIT_F_MASK;
+ mask = nla_data(tb[TCA_SKBEDIT_MASK]);
+ }
+
+ if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
+ u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+
+ if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
+ flags |= SKBEDIT_F_INHERITDSFIELD;
+ }
+
+ parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (!flags) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_skbedit_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ d = to_skbedit(*a);
+ ret = ACT_P_CREATED;
+ } else {
+ d = to_skbedit(*a);
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ }
+
+ ASSERT_RTNL();
+
+ params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+ if (unlikely(!params_new)) {
+ tcf_idr_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ params_new->flags = flags;
+ if (flags & SKBEDIT_F_PRIORITY)
+ params_new->priority = *priority;
+ if (flags & SKBEDIT_F_QUEUE_MAPPING)
+ params_new->queue_mapping = *queue_mapping;
+ if (flags & SKBEDIT_F_MARK)
+ params_new->mark = *mark;
+ if (flags & SKBEDIT_F_PTYPE)
+ params_new->ptype = *ptype;
+ /* default behaviour is to use all the bits */
+ params_new->mask = 0xffffffff;
+ if (flags & SKBEDIT_F_MASK)
+ params_new->mask = *mask;
+
+ d->tcf_action = parm->action;
+ params_old = rtnl_dereference(d->params);
+ rcu_assign_pointer(d->params, params_new);
+ if (params_old)
+ kfree_rcu(params_old, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+}
+
+static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
+ struct tc_skbedit opt = {
+ .index = d->tcf_index,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
+ .action = d->tcf_action,
+ };
+ u64 pure_flags = 0;
+ struct tcf_t t;
+
+ params = rtnl_dereference(d->params);
+
+ if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((params->flags & SKBEDIT_F_PRIORITY) &&
+ nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority))
+ goto nla_put_failure;
+ if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) &&
+ nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping))
+ goto nla_put_failure;
+ if ((params->flags & SKBEDIT_F_MARK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark))
+ goto nla_put_failure;
+ if ((params->flags & SKBEDIT_F_PTYPE) &&
+ nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype))
+ goto nla_put_failure;
+ if ((params->flags & SKBEDIT_F_MASK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask))
+ goto nla_put_failure;
+ if (params->flags & SKBEDIT_F_INHERITDSFIELD)
+ pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+ if (pure_flags != 0 &&
+ nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &d->tcf_tm);
+ if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static void tcf_skbedit_cleanup(struct tc_action *a)
+{
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
+
+ params = rcu_dereference_protected(d->params, 1);
+ if (params)
+ kfree_rcu(params, rcu);
+}
+
+static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_skbedit_ops = {
+ .kind = "skbedit",
+ .type = TCA_ACT_SKBEDIT,
+ .owner = THIS_MODULE,
+ .act = tcf_skbedit_act,
+ .dump = tcf_skbedit_dump,
+ .init = tcf_skbedit_init,
+ .cleanup = tcf_skbedit_cleanup,
+ .walk = tcf_skbedit_walker,
+ .lookup = tcf_skbedit_search,
+ .size = sizeof(struct tcf_skbedit),
+};
+
+static __net_init int skbedit_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ return tc_action_net_init(net, tn, &act_skbedit_ops);
+}
+
+static void __net_exit skbedit_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, skbedit_net_id);
+}
+
+static struct pernet_operations skbedit_net_ops = {
+ .init = skbedit_init_net,
+ .exit_batch = skbedit_exit_net,
+ .id = &skbedit_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
+MODULE_DESCRIPTION("SKB Editing");
+MODULE_LICENSE("GPL");
+
+static int __init skbedit_init_module(void)
+{
+ return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops);
+}
+
+static void __exit skbedit_cleanup_module(void)
+{
+ tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops);
+}
+
+module_init(skbedit_init_module);
+module_exit(skbedit_cleanup_module);
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
new file mode 100644
index 000000000..03a272af6
--- /dev/null
+++ b/net/sched/act_skbmod.c
@@ -0,0 +1,314 @@
+/*
+ * net/sched/act_skbmod.c skb data modifier
+ *
+ * Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbmod.h>
+#include <net/tc_act/tc_skbmod.h>
+
+static unsigned int skbmod_net_id;
+static struct tc_action_ops act_skbmod_ops;
+
+#define MAX_EDIT_LEN ETH_HLEN
+static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ int action;
+ struct tcf_skbmod_params *p;
+ u64 flags;
+ int err;
+
+ tcf_lastuse_update(&d->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+
+ action = READ_ONCE(d->tcf_action);
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
+
+ if (!skb->dev || skb->dev->type != ARPHRD_ETHER)
+ return action;
+
+ /* XXX: if you are going to edit more fields beyond ethernet header
+ * (example when you add IP header replacement or vlan swap)
+ * then MAX_EDIT_LEN needs to change appropriately
+ */
+ err = skb_ensure_writable(skb, MAX_EDIT_LEN);
+ if (unlikely(err)) /* best policy is to drop on the floor */
+ goto drop;
+
+ p = rcu_dereference_bh(d->skbmod_p);
+ flags = p->flags;
+ if (flags & SKBMOD_F_DMAC)
+ ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
+ if (flags & SKBMOD_F_SMAC)
+ ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
+ if (flags & SKBMOD_F_ETYPE)
+ eth_hdr(skb)->h_proto = p->eth_type;
+
+ if (flags & SKBMOD_F_SWAPMAC) {
+ u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
+ /*XXX: I am sure we can come up with more efficient swapping*/
+ ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
+ ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
+ ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr);
+ }
+
+ return action;
+
+drop:
+ qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
+ [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) },
+ [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN },
+ [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN },
+ [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 },
+};
+
+static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+ struct nlattr *tb[TCA_SKBMOD_MAX + 1];
+ struct tcf_skbmod_params *p, *p_old;
+ struct tc_skbmod *parm;
+ u32 lflags = 0, index;
+ struct tcf_skbmod *d;
+ bool exists = false;
+ u8 *daddr = NULL;
+ u8 *saddr = NULL;
+ u16 eth_type = 0;
+ int ret = 0, err;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_SKBMOD_PARMS])
+ return -EINVAL;
+
+ if (tb[TCA_SKBMOD_DMAC]) {
+ daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
+ lflags |= SKBMOD_F_DMAC;
+ }
+
+ if (tb[TCA_SKBMOD_SMAC]) {
+ saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
+ lflags |= SKBMOD_F_SMAC;
+ }
+
+ if (tb[TCA_SKBMOD_ETYPE]) {
+ eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
+ lflags |= SKBMOD_F_ETYPE;
+ }
+
+ parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+ index = parm->index;
+ if (parm->flags & SKBMOD_F_SWAPMAC)
+ lflags = SKBMOD_F_SWAPMAC;
+
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (!lflags) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_skbmod_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ d = to_skbmod(*a);
+
+ p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
+ if (unlikely(!p)) {
+ tcf_idr_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ p->flags = lflags;
+ d->tcf_action = parm->action;
+
+ if (ovr)
+ spin_lock_bh(&d->tcf_lock);
+ /* Protected by tcf_lock if overwriting existing action. */
+ p_old = rcu_dereference_protected(d->skbmod_p, 1);
+
+ if (lflags & SKBMOD_F_DMAC)
+ ether_addr_copy(p->eth_dst, daddr);
+ if (lflags & SKBMOD_F_SMAC)
+ ether_addr_copy(p->eth_src, saddr);
+ if (lflags & SKBMOD_F_ETYPE)
+ p->eth_type = htons(eth_type);
+
+ rcu_assign_pointer(d->skbmod_p, p);
+ if (ovr)
+ spin_unlock_bh(&d->tcf_lock);
+
+ if (p_old)
+ kfree_rcu(p_old, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+}
+
+static void tcf_skbmod_cleanup(struct tc_action *a)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ struct tcf_skbmod_params *p;
+
+ p = rcu_dereference_protected(d->skbmod_p, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_skbmod_params *p;
+ struct tc_skbmod opt = {
+ .index = d->tcf_index,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&d->tcf_lock);
+ opt.action = d->tcf_action;
+ p = rcu_dereference_protected(d->skbmod_p,
+ lockdep_is_held(&d->tcf_lock));
+ opt.flags = p->flags;
+ if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_DMAC) &&
+ nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_SMAC) &&
+ nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_ETYPE) &&
+ nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type)))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &d->tcf_tm);
+ if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&d->tcf_lock);
+ return skb->len;
+nla_put_failure:
+ spin_unlock_bh(&d->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_skbmod_ops = {
+ .kind = "skbmod",
+ .type = TCA_ACT_SKBMOD,
+ .owner = THIS_MODULE,
+ .act = tcf_skbmod_act,
+ .dump = tcf_skbmod_dump,
+ .init = tcf_skbmod_init,
+ .cleanup = tcf_skbmod_cleanup,
+ .walk = tcf_skbmod_walker,
+ .lookup = tcf_skbmod_search,
+ .size = sizeof(struct tcf_skbmod),
+};
+
+static __net_init int skbmod_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tc_action_net_init(net, tn, &act_skbmod_ops);
+}
+
+static void __net_exit skbmod_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, skbmod_net_id);
+}
+
+static struct pernet_operations skbmod_net_ops = {
+ .init = skbmod_init_net,
+ .exit_batch = skbmod_exit_net,
+ .id = &skbmod_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
+MODULE_DESCRIPTION("SKB data mod-ing");
+MODULE_LICENSE("GPL");
+
+static int __init skbmod_init_module(void)
+{
+ return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+static void __exit skbmod_cleanup_module(void)
+{
+ tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+module_init(skbmod_init_module);
+module_exit(skbmod_cleanup_module);
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
new file mode 100644
index 000000000..f43234be5
--- /dev/null
+++ b/net/sched/act_tunnel_key.c
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/geneve.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+
+#include <linux/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_tunnel_key.h>
+
+static unsigned int tunnel_key_net_id;
+static struct tc_action_ops act_tunnel_key_ops;
+
+static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+ int action;
+
+ params = rcu_dereference_bh(t->params);
+
+ tcf_lastuse_update(&t->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+ action = READ_ONCE(t->tcf_action);
+
+ switch (params->tcft_action) {
+ case TCA_TUNNEL_KEY_ACT_RELEASE:
+ skb_dst_drop(skb);
+ break;
+ case TCA_TUNNEL_KEY_ACT_SET:
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst_clone(&params->tcft_enc_metadata->dst));
+ break;
+ default:
+ WARN_ONCE(1, "Bad tunnel_key action %d.\n",
+ params->tcft_action);
+ break;
+ }
+
+ return action;
+}
+
+static const struct nla_policy
+enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
+ .len = 128 },
+};
+
+static int
+tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1];
+ int err, data_len, opt_len;
+ u8 *data;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
+ return -EINVAL;
+ }
+
+ data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+ data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+ if (data_len < 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
+ return -ERANGE;
+ }
+ if (data_len % 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
+ return -ERANGE;
+ }
+
+ opt_len = sizeof(struct geneve_opt) + data_len;
+ if (dst) {
+ struct geneve_opt *opt = dst;
+
+ WARN_ON(dst_len < opt_len);
+
+ opt->opt_class =
+ nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]);
+ opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]);
+ opt->length = data_len / 4; /* length is in units of 4 bytes */
+ opt->r1 = 0;
+ opt->r2 = 0;
+ opt->r3 = 0;
+
+ memcpy(opt + 1, data, data_len);
+ }
+
+ return opt_len;
+}
+
+static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
+ int dst_len, struct netlink_ext_ack *extack)
+{
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ const struct nlattr *attr, *head = nla_data(nla);
+
+ err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_for_each_attr(attr, head, len, rem) {
+ switch (nla_type(attr)) {
+ case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ opt_len = tunnel_key_copy_geneve_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -EINVAL;
+ }
+ if (dst) {
+ dst_len -= opt_len;
+ dst += opt_len;
+ }
+ break;
+ }
+ }
+
+ if (!opts_len) {
+ NL_SET_ERR_MSG(extack, "Empty list of tunnel options");
+ return -EINVAL;
+ }
+
+ if (rem > 0) {
+ NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes");
+ return -EINVAL;
+ }
+
+ return opts_len;
+}
+
+static int tunnel_key_get_opts_len(struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ return tunnel_key_copy_opts(nla, NULL, 0, extack);
+}
+
+static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
+ int opts_len, struct netlink_ext_ack *extack)
+{
+ info->options_len = opts_len;
+ switch (nla_type(nla_data(nla))) {
+ case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ default:
+ NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
+ return -EINVAL;
+ }
+}
+
+static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
+ [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) },
+ [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
+ [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 },
+};
+
+static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
+{
+ if (!p)
+ return;
+ if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
+ dst_release(&p->tcft_enc_metadata->dst);
+ kfree_rcu(p, rcu);
+}
+
+static int tunnel_key_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+ struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
+ struct tcf_tunnel_key_params *params_new;
+ struct metadata_dst *metadata = NULL;
+ struct tc_tunnel_key *parm;
+ struct tcf_tunnel_key *t;
+ bool exists = false;
+ __be16 dst_port = 0;
+ int opts_len = 0;
+ __be64 key_id;
+ __be16 flags;
+ u8 tos, ttl;
+ int ret = 0;
+ u32 index;
+ int err;
+
+ if (!nla) {
+ NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy,
+ extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes");
+ return err;
+ }
+
+ if (!tb[TCA_TUNNEL_KEY_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key parameters");
+ return -EINVAL;
+ }
+
+ parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ switch (parm->t_action) {
+ case TCA_TUNNEL_KEY_ACT_RELEASE:
+ break;
+ case TCA_TUNNEL_KEY_ACT_SET:
+ if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key id");
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+
+ flags = TUNNEL_KEY | TUNNEL_CSUM;
+ if (tb[TCA_TUNNEL_KEY_NO_CSUM] &&
+ nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
+ flags &= ~TUNNEL_CSUM;
+
+ if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
+ dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
+
+ if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) {
+ opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+ extack);
+ if (opts_len < 0) {
+ ret = opts_len;
+ goto err_out;
+ }
+ }
+
+ tos = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TOS])
+ tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]);
+ ttl = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TTL])
+ ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]);
+
+ if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
+ tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
+ __be32 saddr;
+ __be32 daddr;
+
+ saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
+ daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
+
+ metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl,
+ dst_port, flags,
+ key_id, opts_len);
+ } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
+ tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+
+ saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
+ daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
+
+ metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
+ 0, flags,
+ key_id, opts_len);
+ } else {
+ NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst");
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ if (!metadata) {
+ NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst");
+ ret = -ENOMEM;
+ goto err_out;
+ }
+
+ if (opts_len) {
+ ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+ &metadata->u.tun_info,
+ opts_len, extack);
+ if (ret < 0)
+ goto release_tun_meta;
+ }
+
+ metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unknown tunnel key action");
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_tunnel_key_ops, bind, true);
+ if (ret) {
+ NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
+ goto release_tun_meta;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ NL_SET_ERR_MSG(extack, "TC IDR already exists");
+ ret = -EEXIST;
+ goto release_tun_meta;
+ }
+
+ t = to_tunnel_key(*a);
+
+ params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+ if (unlikely(!params_new)) {
+ NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
+ ret = -ENOMEM;
+ exists = true;
+ goto release_tun_meta;
+ }
+ params_new->tcft_action = parm->t_action;
+ params_new->tcft_enc_metadata = metadata;
+
+ spin_lock_bh(&t->tcf_lock);
+ t->tcf_action = parm->action;
+ rcu_swap_protected(t->params, params_new,
+ lockdep_is_held(&t->tcf_lock));
+ spin_unlock_bh(&t->tcf_lock);
+ tunnel_key_release_params(params_new);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+
+release_tun_meta:
+ if (metadata)
+ dst_release(&metadata->dst);
+
+err_out:
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return ret;
+}
+
+static void tunnel_key_release(struct tc_action *a)
+{
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+
+ params = rcu_dereference_protected(t->params, 1);
+ tunnel_key_release_params(params);
+}
+
+static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ int len = info->options_len;
+ u8 *src = (u8 *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
+ if (!start)
+ return -EMSGSIZE;
+
+ while (len > 0) {
+ struct geneve_opt *opt = (struct geneve_opt *)src;
+
+ if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE,
+ opt->type) ||
+ nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA,
+ opt->length * 4, opt + 1)) {
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+ }
+
+ len -= sizeof(struct geneve_opt) + opt->length * 4;
+ src += sizeof(struct geneve_opt) + opt->length * 4;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct nlattr *start;
+ int err = -EINVAL;
+
+ if (!info->options_len)
+ return 0;
+
+ start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+ err = tunnel_key_geneve_opts_dump(skb, info);
+ if (err)
+ goto err_out;
+ } else {
+err_out:
+ nla_nest_cancel(skb, start);
+ return err;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_dump_addresses(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ unsigned short family = ip_tunnel_info_af(info);
+
+ if (family == AF_INET) {
+ __be32 saddr = info->key.u.ipv4.src;
+ __be32 daddr = info->key.u.ipv4.dst;
+
+ if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) &&
+ !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr))
+ return 0;
+ }
+
+ if (family == AF_INET6) {
+ const struct in6_addr *saddr6 = &info->key.u.ipv6.src;
+ const struct in6_addr *daddr6 = &info->key.u.ipv6.dst;
+
+ if (!nla_put_in6_addr(skb,
+ TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) &&
+ !nla_put_in6_addr(skb,
+ TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6))
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+ struct tc_tunnel_key opt = {
+ .index = t->tcf_index,
+ .refcnt = refcount_read(&t->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&t->tcf_bindcnt) - bind,
+ };
+ struct tcf_t tm;
+
+ spin_lock_bh(&t->tcf_lock);
+ params = rcu_dereference_protected(t->params,
+ lockdep_is_held(&t->tcf_lock));
+ opt.action = t->tcf_action;
+ opt.t_action = params->tcft_action;
+
+ if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
+ struct ip_tunnel_info *info =
+ &params->tcft_enc_metadata->u.tun_info;
+ struct ip_tunnel_key *key = &info->key;
+ __be32 key_id = tunnel_id_to_key32(key->tun_id);
+
+ if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
+ tunnel_key_dump_addresses(skb,
+ &params->tcft_enc_metadata->u.tun_info) ||
+ nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
+ !(key->tun_flags & TUNNEL_CSUM)) ||
+ tunnel_key_opts_dump(skb, info))
+ goto nla_put_failure;
+
+ if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos))
+ goto nla_put_failure;
+
+ if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl))
+ goto nla_put_failure;
+ }
+
+ tcf_tm_dump(&tm, &t->tcf_tm);
+ if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
+ &tm, TCA_TUNNEL_KEY_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&t->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&t->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_tunnel_key_ops = {
+ .kind = "tunnel_key",
+ .type = TCA_ACT_TUNNEL_KEY,
+ .owner = THIS_MODULE,
+ .act = tunnel_key_act,
+ .dump = tunnel_key_dump,
+ .init = tunnel_key_init,
+ .cleanup = tunnel_key_release,
+ .walk = tunnel_key_walker,
+ .lookup = tunnel_key_search,
+ .size = sizeof(struct tcf_tunnel_key),
+};
+
+static __net_init int tunnel_key_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tc_action_net_init(net, tn, &act_tunnel_key_ops);
+}
+
+static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, tunnel_key_net_id);
+}
+
+static struct pernet_operations tunnel_key_net_ops = {
+ .init = tunnel_key_init_net,
+ .exit_batch = tunnel_key_exit_net,
+ .id = &tunnel_key_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init tunnel_key_init_module(void)
+{
+ return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+static void __exit tunnel_key_cleanup_module(void)
+{
+ tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+module_init(tunnel_key_init_module);
+module_exit(tunnel_key_cleanup_module);
+
+MODULE_AUTHOR("Amir Vadai <amir@vadai.me>");
+MODULE_DESCRIPTION("ip tunnel manipulation actions");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
new file mode 100644
index 000000000..41528b966
--- /dev/null
+++ b/net/sched/act_vlan.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_vlan.h>
+#include <net/tc_act/tc_vlan.h>
+
+static unsigned int vlan_net_id;
+static struct tc_action_ops act_vlan_ops;
+
+static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_vlan *v = to_vlan(a);
+ struct tcf_vlan_params *p;
+ int action;
+ int err;
+ u16 tci;
+
+ tcf_lastuse_update(&v->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+
+ /* Ensure 'data' points at mac_header prior calling vlan manipulating
+ * functions.
+ */
+ if (skb_at_tc_ingress(skb))
+ skb_push_rcsum(skb, skb->mac_len);
+
+ action = READ_ONCE(v->tcf_action);
+
+ p = rcu_dereference_bh(v->vlan_p);
+
+ switch (p->tcfv_action) {
+ case TCA_VLAN_ACT_POP:
+ err = skb_vlan_pop(skb);
+ if (err)
+ goto drop;
+ break;
+ case TCA_VLAN_ACT_PUSH:
+ err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid |
+ (p->tcfv_push_prio << VLAN_PRIO_SHIFT));
+ if (err)
+ goto drop;
+ break;
+ case TCA_VLAN_ACT_MODIFY:
+ /* No-op if no vlan tag (either hw-accel or in-payload) */
+ if (!skb_vlan_tagged(skb))
+ goto out;
+ /* extract existing tag (and guarantee no hw-accel tag) */
+ if (skb_vlan_tag_present(skb)) {
+ tci = skb_vlan_tag_get(skb);
+ skb->vlan_tci = 0;
+ } else {
+ /* in-payload vlan tag, pop it */
+ err = __skb_vlan_pop(skb, &tci);
+ if (err)
+ goto drop;
+ }
+ /* replace the vid */
+ tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid;
+ /* replace prio bits, if tcfv_push_prio specified */
+ if (p->tcfv_push_prio) {
+ tci &= ~VLAN_PRIO_MASK;
+ tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT;
+ }
+ /* put updated tci as hwaccel tag */
+ __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci);
+ break;
+ default:
+ BUG();
+ }
+
+out:
+ if (skb_at_tc_ingress(skb))
+ skb_pull_rcsum(skb, skb->mac_len);
+
+ return action;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
+ [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) },
+ [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 },
+ [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 },
+ [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 },
+};
+
+static int tcf_vlan_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+ struct nlattr *tb[TCA_VLAN_MAX + 1];
+ struct tcf_vlan_params *p;
+ struct tc_vlan *parm;
+ struct tcf_vlan *v;
+ int action;
+ u16 push_vid = 0;
+ __be16 push_proto = 0;
+ u8 push_prio = 0;
+ bool exists = false;
+ int ret = 0, err;
+ u32 index;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_VLAN_PARMS])
+ return -EINVAL;
+ parm = nla_data(tb[TCA_VLAN_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ switch (parm->v_action) {
+ case TCA_VLAN_ACT_POP:
+ break;
+ case TCA_VLAN_ACT_PUSH:
+ case TCA_VLAN_ACT_MODIFY:
+ if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+ push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
+ if (push_vid >= VLAN_VID_MASK) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -ERANGE;
+ }
+
+ if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) {
+ push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]);
+ switch (push_proto) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ break;
+ default:
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EPROTONOSUPPORT;
+ }
+ } else {
+ push_proto = htons(ETH_P_8021Q);
+ }
+
+ if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY])
+ push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]);
+ break;
+ default:
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+ action = parm->v_action;
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_vlan_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ v = to_vlan(*a);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ tcf_idr_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ p->tcfv_action = action;
+ p->tcfv_push_vid = push_vid;
+ p->tcfv_push_prio = push_prio;
+ p->tcfv_push_proto = push_proto;
+
+ spin_lock_bh(&v->tcf_lock);
+ v->tcf_action = parm->action;
+ rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
+ spin_unlock_bh(&v->tcf_lock);
+
+ if (p)
+ kfree_rcu(p, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+}
+
+static void tcf_vlan_cleanup(struct tc_action *a)
+{
+ struct tcf_vlan *v = to_vlan(a);
+ struct tcf_vlan_params *p;
+
+ p = rcu_dereference_protected(v->vlan_p, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_vlan *v = to_vlan(a);
+ struct tcf_vlan_params *p;
+ struct tc_vlan opt = {
+ .index = v->tcf_index,
+ .refcnt = refcount_read(&v->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&v->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&v->tcf_lock);
+ opt.action = v->tcf_action;
+ p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
+ opt.v_action = p->tcfv_action;
+ if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if ((p->tcfv_action == TCA_VLAN_ACT_PUSH ||
+ p->tcfv_action == TCA_VLAN_ACT_MODIFY) &&
+ (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) ||
+ nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
+ p->tcfv_push_proto) ||
+ (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY,
+ p->tcfv_push_prio))))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &v->tcf_tm);
+ if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&v->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&v->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static size_t tcf_vlan_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_vlan))
+ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_ID */
+ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_PROTOCOL */
+ + nla_total_size(sizeof(u8)); /* TCA_VLAN_PUSH_VLAN_PRIORITY */
+}
+
+static struct tc_action_ops act_vlan_ops = {
+ .kind = "vlan",
+ .type = TCA_ACT_VLAN,
+ .owner = THIS_MODULE,
+ .act = tcf_vlan_act,
+ .dump = tcf_vlan_dump,
+ .init = tcf_vlan_init,
+ .cleanup = tcf_vlan_cleanup,
+ .walk = tcf_vlan_walker,
+ .get_fill_size = tcf_vlan_get_fill_size,
+ .lookup = tcf_vlan_search,
+ .size = sizeof(struct tcf_vlan),
+};
+
+static __net_init int vlan_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ return tc_action_net_init(net, tn, &act_vlan_ops);
+}
+
+static void __net_exit vlan_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, vlan_net_id);
+}
+
+static struct pernet_operations vlan_net_ops = {
+ .init = vlan_init_net,
+ .exit_batch = vlan_exit_net,
+ .id = &vlan_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init vlan_init_module(void)
+{
+ return tcf_register_action(&act_vlan_ops, &vlan_net_ops);
+}
+
+static void __exit vlan_cleanup_module(void)
+{
+ tcf_unregister_action(&act_vlan_ops, &vlan_net_ops);
+}
+
+module_init(vlan_init_module);
+module_exit(vlan_cleanup_module);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("vlan manipulation actions");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
new file mode 100644
index 000000000..435911dc9
--- /dev/null
+++ b/net/sched/cls_api.c
@@ -0,0 +1,2320 @@
+/*
+ * net/sched/cls_api.c Packet classifier API.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
+
+/* The list of all installed classifier types */
+static LIST_HEAD(tcf_proto_base);
+
+/* Protects list of registered TC modules. It is pure SMP lock. */
+static DEFINE_RWLOCK(cls_mod_lock);
+
+/* Find classifier type by string name */
+
+static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
+{
+ const struct tcf_proto_ops *t, *res = NULL;
+
+ if (kind) {
+ read_lock(&cls_mod_lock);
+ list_for_each_entry(t, &tcf_proto_base, head) {
+ if (strcmp(kind, t->kind) == 0) {
+ if (try_module_get(t->owner))
+ res = t;
+ break;
+ }
+ }
+ read_unlock(&cls_mod_lock);
+ }
+ return res;
+}
+
+static const struct tcf_proto_ops *
+tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack)
+{
+ const struct tcf_proto_ops *ops;
+
+ ops = __tcf_proto_lookup_ops(kind);
+ if (ops)
+ return ops;
+#ifdef CONFIG_MODULES
+ rtnl_unlock();
+ request_module("cls_%s", kind);
+ rtnl_lock();
+ ops = __tcf_proto_lookup_ops(kind);
+ /* We dropped the RTNL semaphore in order to perform
+ * the module load. So, even if we succeeded in loading
+ * the module we have to replay the request. We indicate
+ * this using -EAGAIN.
+ */
+ if (ops) {
+ module_put(ops->owner);
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ NL_SET_ERR_MSG(extack, "TC classifier not found");
+ return ERR_PTR(-ENOENT);
+}
+
+/* Register(unregister) new classifier type */
+
+int register_tcf_proto_ops(struct tcf_proto_ops *ops)
+{
+ struct tcf_proto_ops *t;
+ int rc = -EEXIST;
+
+ write_lock(&cls_mod_lock);
+ list_for_each_entry(t, &tcf_proto_base, head)
+ if (!strcmp(ops->kind, t->kind))
+ goto out;
+
+ list_add_tail(&ops->head, &tcf_proto_base);
+ rc = 0;
+out:
+ write_unlock(&cls_mod_lock);
+ return rc;
+}
+EXPORT_SYMBOL(register_tcf_proto_ops);
+
+static struct workqueue_struct *tc_filter_wq;
+
+int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
+{
+ struct tcf_proto_ops *t;
+ int rc = -ENOENT;
+
+ /* Wait for outstanding call_rcu()s, if any, from a
+ * tcf_proto_ops's destroy() handler.
+ */
+ rcu_barrier();
+ flush_workqueue(tc_filter_wq);
+
+ write_lock(&cls_mod_lock);
+ list_for_each_entry(t, &tcf_proto_base, head) {
+ if (t == ops) {
+ list_del(&t->head);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&cls_mod_lock);
+ return rc;
+}
+EXPORT_SYMBOL(unregister_tcf_proto_ops);
+
+bool tcf_queue_work(struct rcu_work *rwork, work_func_t func)
+{
+ INIT_RCU_WORK(rwork, func);
+ return queue_rcu_work(tc_filter_wq, rwork);
+}
+EXPORT_SYMBOL(tcf_queue_work);
+
+/* Select new prio value from the range, managed by kernel. */
+
+static inline u32 tcf_auto_prio(struct tcf_proto *tp)
+{
+ u32 first = TC_H_MAKE(0xC0000000U, 0U);
+
+ if (tp)
+ first = tp->prio - 1;
+
+ return TC_H_MAJ(first);
+}
+
+static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
+ u32 prio, struct tcf_chain *chain,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_proto *tp;
+ int err;
+
+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOBUFS);
+
+ tp->ops = tcf_proto_lookup_ops(kind, extack);
+ if (IS_ERR(tp->ops)) {
+ err = PTR_ERR(tp->ops);
+ goto errout;
+ }
+ tp->classify = tp->ops->classify;
+ tp->protocol = protocol;
+ tp->prio = prio;
+ tp->chain = chain;
+
+ err = tp->ops->init(tp);
+ if (err) {
+ module_put(tp->ops->owner);
+ goto errout;
+ }
+ return tp;
+
+errout:
+ kfree(tp);
+ return ERR_PTR(err);
+}
+
+static void tcf_proto_destroy(struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ tp->ops->destroy(tp, extack);
+ module_put(tp->ops->owner);
+ kfree_rcu(tp, rcu);
+}
+
+struct tcf_filter_chain_list_item {
+ struct list_head list;
+ tcf_chain_head_change_t *chain_head_change;
+ void *chain_head_change_priv;
+};
+
+static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
+ u32 chain_index)
+{
+ struct tcf_chain *chain;
+
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ if (!chain)
+ return NULL;
+ list_add_tail(&chain->list, &block->chain_list);
+ chain->block = block;
+ chain->index = chain_index;
+ chain->refcnt = 1;
+ if (!chain->index)
+ block->chain0.chain = chain;
+ return chain;
+}
+
+static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item,
+ struct tcf_proto *tp_head)
+{
+ if (item->chain_head_change)
+ item->chain_head_change(tp_head, item->chain_head_change_priv);
+}
+
+static void tcf_chain0_head_change(struct tcf_chain *chain,
+ struct tcf_proto *tp_head)
+{
+ struct tcf_filter_chain_list_item *item;
+ struct tcf_block *block = chain->block;
+
+ if (chain->index)
+ return;
+ list_for_each_entry(item, &block->chain0.filter_chain_list, list)
+ tcf_chain_head_change_item(item, tp_head);
+}
+
+static void tcf_chain_destroy(struct tcf_chain *chain)
+{
+ struct tcf_block *block = chain->block;
+
+ list_del(&chain->list);
+ if (!chain->index)
+ block->chain0.chain = NULL;
+ kfree(chain);
+ if (list_empty(&block->chain_list) && block->refcnt == 0)
+ kfree(block);
+}
+
+static void tcf_chain_hold(struct tcf_chain *chain)
+{
+ ++chain->refcnt;
+}
+
+static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain)
+{
+ /* In case all the references are action references, this
+ * chain should not be shown to the user.
+ */
+ return chain->refcnt == chain->action_refcnt;
+}
+
+static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
+ u32 chain_index)
+{
+ struct tcf_chain *chain;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ if (chain->index == chain_index)
+ return chain;
+ }
+ return NULL;
+}
+
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+ u32 seq, u16 flags, int event, bool unicast);
+
+static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
+ u32 chain_index, bool create,
+ bool by_act)
+{
+ struct tcf_chain *chain = tcf_chain_lookup(block, chain_index);
+
+ if (chain) {
+ tcf_chain_hold(chain);
+ } else {
+ if (!create)
+ return NULL;
+ chain = tcf_chain_create(block, chain_index);
+ if (!chain)
+ return NULL;
+ }
+
+ if (by_act)
+ ++chain->action_refcnt;
+
+ /* Send notification only in case we got the first
+ * non-action reference. Until then, the chain acts only as
+ * a placeholder for actions pointing to it and user ought
+ * not know about them.
+ */
+ if (chain->refcnt - chain->action_refcnt == 1 && !by_act)
+ tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+ RTM_NEWCHAIN, false);
+
+ return chain;
+}
+
+static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
+ bool create)
+{
+ return __tcf_chain_get(block, chain_index, create, false);
+}
+
+struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index)
+{
+ return __tcf_chain_get(block, chain_index, true, true);
+}
+EXPORT_SYMBOL(tcf_chain_get_by_act);
+
+static void tc_chain_tmplt_del(struct tcf_chain *chain);
+
+static void __tcf_chain_put(struct tcf_chain *chain, bool by_act)
+{
+ if (by_act)
+ chain->action_refcnt--;
+ chain->refcnt--;
+
+ /* The last dropped non-action reference will trigger notification. */
+ if (chain->refcnt - chain->action_refcnt == 0 && !by_act)
+ tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false);
+
+ if (chain->refcnt == 0) {
+ tc_chain_tmplt_del(chain);
+ tcf_chain_destroy(chain);
+ }
+}
+
+static void tcf_chain_put(struct tcf_chain *chain)
+{
+ __tcf_chain_put(chain, false);
+}
+
+void tcf_chain_put_by_act(struct tcf_chain *chain)
+{
+ __tcf_chain_put(chain, true);
+}
+EXPORT_SYMBOL(tcf_chain_put_by_act);
+
+static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
+{
+ if (chain->explicitly_created)
+ tcf_chain_put(chain);
+}
+
+static void tcf_chain_flush(struct tcf_chain *chain)
+{
+ struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
+
+ tcf_chain0_head_change(chain, NULL);
+ while (tp) {
+ RCU_INIT_POINTER(chain->filter_chain, tp->next);
+ tcf_proto_destroy(tp, NULL);
+ tp = rtnl_dereference(chain->filter_chain);
+ tcf_chain_put(chain);
+ }
+}
+
+static bool tcf_block_offload_in_use(struct tcf_block *block)
+{
+ return block->offloadcnt;
+}
+
+static int tcf_block_offload_cmd(struct tcf_block *block,
+ struct net_device *dev,
+ struct tcf_block_ext_info *ei,
+ enum tc_block_command command,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_block_offload bo = {};
+
+ bo.command = command;
+ bo.binder_type = ei->binder_type;
+ bo.block = block;
+ bo.extack = extack;
+ return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+}
+
+static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = q->dev_queue->dev;
+ int err;
+
+ if (!dev->netdev_ops->ndo_setup_tc)
+ goto no_offload_dev_inc;
+
+ /* If tc offload feature is disabled and the block we try to bind
+ * to already has some offloaded filters, forbid to bind.
+ */
+ if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
+ NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
+ return -EOPNOTSUPP;
+ }
+
+ err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
+ if (err == -EOPNOTSUPP)
+ goto no_offload_dev_inc;
+ return err;
+
+no_offload_dev_inc:
+ if (tcf_block_offload_in_use(block))
+ return -EOPNOTSUPP;
+ block->nooffloaddevcnt++;
+ return 0;
+}
+
+static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei)
+{
+ struct net_device *dev = q->dev_queue->dev;
+ int err;
+
+ if (!dev->netdev_ops->ndo_setup_tc)
+ goto no_offload_dev_dec;
+ err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+ if (err == -EOPNOTSUPP)
+ goto no_offload_dev_dec;
+ return;
+
+no_offload_dev_dec:
+ WARN_ON(block->nooffloaddevcnt-- == 0);
+}
+
+static int
+tcf_chain0_head_change_cb_add(struct tcf_block *block,
+ struct tcf_block_ext_info *ei,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_chain *chain0 = block->chain0.chain;
+ struct tcf_filter_chain_list_item *item;
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item) {
+ NL_SET_ERR_MSG(extack, "Memory allocation for head change callback item failed");
+ return -ENOMEM;
+ }
+ item->chain_head_change = ei->chain_head_change;
+ item->chain_head_change_priv = ei->chain_head_change_priv;
+ if (chain0 && chain0->filter_chain)
+ tcf_chain_head_change_item(item, chain0->filter_chain);
+ list_add(&item->list, &block->chain0.filter_chain_list);
+ return 0;
+}
+
+static void
+tcf_chain0_head_change_cb_del(struct tcf_block *block,
+ struct tcf_block_ext_info *ei)
+{
+ struct tcf_chain *chain0 = block->chain0.chain;
+ struct tcf_filter_chain_list_item *item;
+
+ list_for_each_entry(item, &block->chain0.filter_chain_list, list) {
+ if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
+ (item->chain_head_change == ei->chain_head_change &&
+ item->chain_head_change_priv == ei->chain_head_change_priv)) {
+ if (chain0)
+ tcf_chain_head_change_item(item, NULL);
+ list_del(&item->list);
+ kfree(item);
+ return;
+ }
+ }
+ WARN_ON(1);
+}
+
+struct tcf_net {
+ struct idr idr;
+};
+
+static unsigned int tcf_net_id;
+
+static int tcf_block_insert(struct tcf_block *block, struct net *net,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+ return idr_alloc_u32(&tn->idr, block, &block->index, block->index,
+ GFP_KERNEL);
+}
+
+static void tcf_block_remove(struct tcf_block *block, struct net *net)
+{
+ struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+ idr_remove(&tn->idr, block->index);
+}
+
+static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
+ u32 block_index,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block;
+
+ block = kzalloc(sizeof(*block), GFP_KERNEL);
+ if (!block) {
+ NL_SET_ERR_MSG(extack, "Memory allocation for block failed");
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&block->chain_list);
+ INIT_LIST_HEAD(&block->cb_list);
+ INIT_LIST_HEAD(&block->owner_list);
+ INIT_LIST_HEAD(&block->chain0.filter_chain_list);
+
+ block->refcnt = 1;
+ block->net = net;
+ block->index = block_index;
+
+ /* Don't store q pointer for blocks which are shared */
+ if (!tcf_block_shared(block))
+ block->q = q;
+ return block;
+}
+
+static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
+{
+ struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+ return idr_find(&tn->idr, block_index);
+}
+
+/* Find tcf block.
+ * Set q, parent, cl when appropriate.
+ */
+
+static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
+ u32 *parent, unsigned long *cl,
+ int ifindex, u32 block_index,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block;
+ int err = 0;
+
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_lookup(net, block_index);
+ if (!block) {
+ NL_SET_ERR_MSG(extack, "Block of given index was not found");
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
+
+ rcu_read_lock();
+
+ /* Find link */
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ return ERR_PTR(-ENODEV);
+ }
+
+ /* Find qdisc */
+ if (!*parent) {
+ *q = dev->qdisc;
+ *parent = (*q)->handle;
+ } else {
+ *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent));
+ if (!*q) {
+ NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+ err = -EINVAL;
+ goto errout_rcu;
+ }
+ }
+
+ *q = qdisc_refcount_inc_nz(*q);
+ if (!*q) {
+ NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+ err = -EINVAL;
+ goto errout_rcu;
+ }
+
+ /* Is it classful? */
+ cops = (*q)->ops->cl_ops;
+ if (!cops) {
+ NL_SET_ERR_MSG(extack, "Qdisc not classful");
+ err = -EINVAL;
+ goto errout_rcu;
+ }
+
+ if (!cops->tcf_block) {
+ NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
+ err = -EOPNOTSUPP;
+ goto errout_rcu;
+ }
+
+ /* At this point we know that qdisc is not noop_qdisc,
+ * which means that qdisc holds a reference to net_device
+ * and we hold a reference to qdisc, so it is safe to release
+ * rcu read lock.
+ */
+ rcu_read_unlock();
+
+ /* Do we search for filter, attached to class? */
+ if (TC_H_MIN(*parent)) {
+ *cl = cops->find(*q, *parent);
+ if (*cl == 0) {
+ NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
+ err = -ENOENT;
+ goto errout_qdisc;
+ }
+ }
+
+ /* And the last stroke */
+ block = cops->tcf_block(*q, *cl, extack);
+ if (!block) {
+ err = -EINVAL;
+ goto errout_qdisc;
+ }
+ if (tcf_block_shared(block)) {
+ NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
+ err = -EOPNOTSUPP;
+ goto errout_qdisc;
+ }
+ }
+
+ return block;
+
+errout_rcu:
+ rcu_read_unlock();
+errout_qdisc:
+ if (*q) {
+ qdisc_put(*q);
+ *q = NULL;
+ }
+ return ERR_PTR(err);
+}
+
+static void tcf_block_release(struct Qdisc *q, struct tcf_block *block)
+{
+ if (q)
+ qdisc_put(q);
+}
+
+struct tcf_block_owner_item {
+ struct list_head list;
+ struct Qdisc *q;
+ enum tcf_block_binder_type binder_type;
+};
+
+static void
+tcf_block_owner_netif_keep_dst(struct tcf_block *block,
+ struct Qdisc *q,
+ enum tcf_block_binder_type binder_type)
+{
+ if (block->keep_dst &&
+ binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+ binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+ netif_keep_dst(qdisc_dev(q));
+}
+
+void tcf_block_netif_keep_dst(struct tcf_block *block)
+{
+ struct tcf_block_owner_item *item;
+
+ block->keep_dst = true;
+ list_for_each_entry(item, &block->owner_list, list)
+ tcf_block_owner_netif_keep_dst(block, item->q,
+ item->binder_type);
+}
+EXPORT_SYMBOL(tcf_block_netif_keep_dst);
+
+static int tcf_block_owner_add(struct tcf_block *block,
+ struct Qdisc *q,
+ enum tcf_block_binder_type binder_type)
+{
+ struct tcf_block_owner_item *item;
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item)
+ return -ENOMEM;
+ item->q = q;
+ item->binder_type = binder_type;
+ list_add(&item->list, &block->owner_list);
+ return 0;
+}
+
+static void tcf_block_owner_del(struct tcf_block *block,
+ struct Qdisc *q,
+ enum tcf_block_binder_type binder_type)
+{
+ struct tcf_block_owner_item *item;
+
+ list_for_each_entry(item, &block->owner_list, list) {
+ if (item->q == q && item->binder_type == binder_type) {
+ list_del(&item->list);
+ kfree(item);
+ return;
+ }
+ }
+ WARN_ON(1);
+}
+
+int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = qdisc_net(q);
+ struct tcf_block *block = NULL;
+ bool created = false;
+ int err;
+
+ if (ei->block_index) {
+ /* block_index not 0 means the shared block is requested */
+ block = tcf_block_lookup(net, ei->block_index);
+ if (block)
+ block->refcnt++;
+ }
+
+ if (!block) {
+ block = tcf_block_create(net, q, ei->block_index, extack);
+ if (IS_ERR(block))
+ return PTR_ERR(block);
+ created = true;
+ if (tcf_block_shared(block)) {
+ err = tcf_block_insert(block, net, extack);
+ if (err)
+ goto err_block_insert;
+ }
+ }
+
+ err = tcf_block_owner_add(block, q, ei->binder_type);
+ if (err)
+ goto err_block_owner_add;
+
+ tcf_block_owner_netif_keep_dst(block, q, ei->binder_type);
+
+ err = tcf_chain0_head_change_cb_add(block, ei, extack);
+ if (err)
+ goto err_chain0_head_change_cb_add;
+
+ err = tcf_block_offload_bind(block, q, ei, extack);
+ if (err)
+ goto err_block_offload_bind;
+
+ *p_block = block;
+ return 0;
+
+err_block_offload_bind:
+ tcf_chain0_head_change_cb_del(block, ei);
+err_chain0_head_change_cb_add:
+ tcf_block_owner_del(block, q, ei->binder_type);
+err_block_owner_add:
+ if (created) {
+ if (tcf_block_shared(block))
+ tcf_block_remove(block, net);
+err_block_insert:
+ kfree(block);
+ } else {
+ block->refcnt--;
+ }
+ return err;
+}
+EXPORT_SYMBOL(tcf_block_get_ext);
+
+static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv)
+{
+ struct tcf_proto __rcu **p_filter_chain = priv;
+
+ rcu_assign_pointer(*p_filter_chain, tp_head);
+}
+
+int tcf_block_get(struct tcf_block **p_block,
+ struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block_ext_info ei = {
+ .chain_head_change = tcf_chain_head_change_dflt,
+ .chain_head_change_priv = p_filter_chain,
+ };
+
+ WARN_ON(!p_filter_chain);
+ return tcf_block_get_ext(p_block, q, &ei, extack);
+}
+EXPORT_SYMBOL(tcf_block_get);
+
+/* XXX: Standalone actions are not allowed to jump to any chain, and bound
+ * actions should be all removed after flushing.
+ */
+void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei)
+{
+ struct tcf_chain *chain, *tmp;
+
+ if (!block)
+ return;
+ tcf_chain0_head_change_cb_del(block, ei);
+ tcf_block_owner_del(block, q, ei->binder_type);
+
+ if (block->refcnt == 1) {
+ if (tcf_block_shared(block))
+ tcf_block_remove(block, block->net);
+
+ /* Hold a refcnt for all chains, so that they don't disappear
+ * while we are iterating.
+ */
+ list_for_each_entry(chain, &block->chain_list, list)
+ tcf_chain_hold(chain);
+
+ list_for_each_entry(chain, &block->chain_list, list)
+ tcf_chain_flush(chain);
+ }
+
+ tcf_block_offload_unbind(block, q, ei);
+
+ if (block->refcnt == 1) {
+ /* At this point, all the chains should have refcnt >= 1. */
+ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+ tcf_chain_put_explicitly_created(chain);
+ tcf_chain_put(chain);
+ }
+
+ block->refcnt--;
+ if (list_empty(&block->chain_list))
+ kfree(block);
+ } else {
+ block->refcnt--;
+ }
+}
+EXPORT_SYMBOL(tcf_block_put_ext);
+
+void tcf_block_put(struct tcf_block *block)
+{
+ struct tcf_block_ext_info ei = {0, };
+
+ if (!block)
+ return;
+ tcf_block_put_ext(block, block->q, &ei);
+}
+
+EXPORT_SYMBOL(tcf_block_put);
+
+struct tcf_block_cb {
+ struct list_head list;
+ tc_setup_cb_t *cb;
+ void *cb_ident;
+ void *cb_priv;
+ unsigned int refcnt;
+};
+
+void *tcf_block_cb_priv(struct tcf_block_cb *block_cb)
+{
+ return block_cb->cb_priv;
+}
+EXPORT_SYMBOL(tcf_block_cb_priv);
+
+struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
+ tc_setup_cb_t *cb, void *cb_ident)
+{ struct tcf_block_cb *block_cb;
+
+ list_for_each_entry(block_cb, &block->cb_list, list)
+ if (block_cb->cb == cb && block_cb->cb_ident == cb_ident)
+ return block_cb;
+ return NULL;
+}
+EXPORT_SYMBOL(tcf_block_cb_lookup);
+
+void tcf_block_cb_incref(struct tcf_block_cb *block_cb)
+{
+ block_cb->refcnt++;
+}
+EXPORT_SYMBOL(tcf_block_cb_incref);
+
+unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
+{
+ return --block_cb->refcnt;
+}
+EXPORT_SYMBOL(tcf_block_cb_decref);
+
+static int
+tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
+ void *cb_priv, bool add, bool offload_in_use,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_chain *chain;
+ struct tcf_proto *tp;
+ int err;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ for (tp = rtnl_dereference(chain->filter_chain); tp;
+ tp = rtnl_dereference(tp->next)) {
+ if (tp->ops->reoffload) {
+ err = tp->ops->reoffload(tp, add, cb, cb_priv,
+ extack);
+ if (err && add)
+ goto err_playback_remove;
+ } else if (add && offload_in_use) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support");
+ goto err_playback_remove;
+ }
+ }
+ }
+
+ return 0;
+
+err_playback_remove:
+ tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use,
+ extack);
+ return err;
+}
+
+struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
+ tc_setup_cb_t *cb, void *cb_ident,
+ void *cb_priv,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block_cb *block_cb;
+ int err;
+
+ /* Replay any already present rules */
+ err = tcf_block_playback_offloads(block, cb, cb_priv, true,
+ tcf_block_offload_in_use(block),
+ extack);
+ if (err)
+ return ERR_PTR(err);
+
+ block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
+ if (!block_cb)
+ return ERR_PTR(-ENOMEM);
+ block_cb->cb = cb;
+ block_cb->cb_ident = cb_ident;
+ block_cb->cb_priv = cb_priv;
+ list_add(&block_cb->list, &block->cb_list);
+ return block_cb;
+}
+EXPORT_SYMBOL(__tcf_block_cb_register);
+
+int tcf_block_cb_register(struct tcf_block *block,
+ tc_setup_cb_t *cb, void *cb_ident,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct tcf_block_cb *block_cb;
+
+ block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
+ extack);
+ return PTR_ERR_OR_ZERO(block_cb);
+}
+EXPORT_SYMBOL(tcf_block_cb_register);
+
+void __tcf_block_cb_unregister(struct tcf_block *block,
+ struct tcf_block_cb *block_cb)
+{
+ tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
+ false, tcf_block_offload_in_use(block),
+ NULL);
+ list_del(&block_cb->list);
+ kfree(block_cb);
+}
+EXPORT_SYMBOL(__tcf_block_cb_unregister);
+
+void tcf_block_cb_unregister(struct tcf_block *block,
+ tc_setup_cb_t *cb, void *cb_ident)
+{
+ struct tcf_block_cb *block_cb;
+
+ block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
+ if (!block_cb)
+ return;
+ __tcf_block_cb_unregister(block, block_cb);
+}
+EXPORT_SYMBOL(tcf_block_cb_unregister);
+
+static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop)
+{
+ struct tcf_block_cb *block_cb;
+ int ok_count = 0;
+ int err;
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop)
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(block_cb, &block->cb_list, list) {
+ err = block_cb->cb(type, type_data, block_cb->cb_priv);
+ if (err) {
+ if (err_stop)
+ return err;
+ } else {
+ ok_count++;
+ }
+ }
+ return ok_count;
+}
+
+/* Main classifier routine: scans classifier chain attached
+ * to this qdisc, (optionally) tests for protocol and asks
+ * specific classifiers.
+ */
+int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res, bool compat_mode)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ const int max_reclassify_loop = 4;
+ const struct tcf_proto *orig_tp = tp;
+ const struct tcf_proto *first_tp;
+ int limit = 0;
+
+reclassify:
+#endif
+ for (; tp; tp = rcu_dereference_bh(tp->next)) {
+ __be16 protocol = skb_protocol(skb, false);
+ int err;
+
+ if (tp->protocol != protocol &&
+ tp->protocol != htons(ETH_P_ALL))
+ continue;
+
+ err = tp->classify(skb, tp, res);
+#ifdef CONFIG_NET_CLS_ACT
+ if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) {
+ first_tp = orig_tp;
+ goto reset;
+ } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
+ first_tp = res->goto_tp;
+ goto reset;
+ }
+#endif
+ if (err >= 0)
+ return err;
+ }
+
+ return TC_ACT_UNSPEC; /* signal: continue lookup */
+#ifdef CONFIG_NET_CLS_ACT
+reset:
+ if (unlikely(limit++ >= max_reclassify_loop)) {
+ net_notice_ratelimited("%u: reclassify loop, rule prio %u, protocol %02x\n",
+ tp->chain->block->index,
+ tp->prio & 0xffff,
+ ntohs(tp->protocol));
+ return TC_ACT_SHOT;
+ }
+
+ tp = first_tp;
+ goto reclassify;
+#endif
+}
+EXPORT_SYMBOL(tcf_classify);
+
+struct tcf_chain_info {
+ struct tcf_proto __rcu **pprev;
+ struct tcf_proto __rcu *next;
+};
+
+static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain_info *chain_info)
+{
+ return rtnl_dereference(*chain_info->pprev);
+}
+
+static void tcf_chain_tp_insert(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info,
+ struct tcf_proto *tp)
+{
+ if (*chain_info->pprev == chain->filter_chain)
+ tcf_chain0_head_change(chain, tp);
+ RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
+ rcu_assign_pointer(*chain_info->pprev, tp);
+ tcf_chain_hold(chain);
+}
+
+static void tcf_chain_tp_remove(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info,
+ struct tcf_proto *tp)
+{
+ struct tcf_proto *next = rtnl_dereference(chain_info->next);
+
+ if (tp == chain->filter_chain)
+ tcf_chain0_head_change(chain, next);
+ RCU_INIT_POINTER(*chain_info->pprev, next);
+ tcf_chain_put(chain);
+}
+
+static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info,
+ u32 protocol, u32 prio,
+ bool prio_allocate)
+{
+ struct tcf_proto **pprev;
+ struct tcf_proto *tp;
+
+ /* Check the chain for existence of proto-tcf with this priority */
+ for (pprev = &chain->filter_chain;
+ (tp = rtnl_dereference(*pprev)); pprev = &tp->next) {
+ if (tp->prio >= prio) {
+ if (tp->prio == prio) {
+ if (prio_allocate ||
+ (tp->protocol != protocol && protocol))
+ return ERR_PTR(-EINVAL);
+ } else {
+ tp = NULL;
+ }
+ break;
+ }
+ }
+ chain_info->pprev = pprev;
+ chain_info->next = tp ? tp->next : NULL;
+ return tp;
+}
+
+static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+ struct tcf_proto *tp, struct tcf_block *block,
+ struct Qdisc *q, u32 parent, void *fh,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ if (q) {
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = parent;
+ } else {
+ tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
+ tcm->tcm_block_index = block->index;
+ }
+ tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
+ if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
+ goto nla_put_failure;
+ if (!fh) {
+ tcm->tcm_handle = 0;
+ } else {
+ if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
+ goto nla_put_failure;
+ }
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ struct tcf_block *block, struct Qdisc *q,
+ u32 parent, void *fh, int event, bool unicast)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+ n->nlmsg_seq, n->nlmsg_flags, event) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ struct tcf_block *block, struct Qdisc *q,
+ u32 parent, void *fh, bool unicast, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ int err;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+ n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+ NL_SET_ERR_MSG(extack, "Failed to build del event notification");
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ err = tp->ops->delete(tp, fh, last, extack);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (err < 0)
+ NL_SET_ERR_MSG(extack, "Failed to send filter delete notification");
+ return err;
+}
+
+static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
+ struct tcf_block *block, struct Qdisc *q,
+ u32 parent, struct nlmsghdr *n,
+ struct tcf_chain *chain, int event)
+{
+ struct tcf_proto *tp;
+
+ for (tp = rtnl_dereference(chain->filter_chain);
+ tp; tp = rtnl_dereference(tp->next))
+ tfilter_notify(net, oskb, n, tp, block,
+ q, parent, NULL, event, false);
+}
+
+static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 protocol;
+ u32 prio;
+ bool prio_allocate;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain_info chain_info;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ struct tcf_proto *tp;
+ unsigned long cl;
+ void *fh;
+ int err;
+ int tp_created;
+
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+replay:
+ tp_created = 0;
+
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ protocol = TC_H_MIN(t->tcm_info);
+ prio = TC_H_MAJ(t->tcm_info);
+ prio_allocate = false;
+ parent = t->tcm_parent;
+ cl = 0;
+
+ if (prio == 0) {
+ /* If no priority is provided by the user,
+ * we allocate one.
+ */
+ if (n->nlmsg_flags & NLM_F_CREATE) {
+ prio = TC_H_MAKE(0x80000000U, 0U);
+ prio_allocate = true;
+ } else {
+ NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
+ return -ENOENT;
+ }
+ }
+
+ /* Find head of filter chain. */
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout;
+ }
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ err = -EINVAL;
+ goto errout;
+ }
+ chain = tcf_chain_get(block, chain_index, true);
+ if (!chain) {
+ NL_SET_ERR_MSG(extack, "Cannot create specified filter chain");
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+ prio, prio_allocate);
+ if (IS_ERR(tp)) {
+ NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+ err = PTR_ERR(tp);
+ goto errout;
+ }
+
+ if (tp == NULL) {
+ /* Proto-tcf does not exist, create new one */
+
+ if (tca[TCA_KIND] == NULL || !protocol) {
+ NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+ NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
+ err = -ENOENT;
+ goto errout;
+ }
+
+ if (prio_allocate)
+ prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info));
+
+ tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
+ protocol, prio, chain, extack);
+ if (IS_ERR(tp)) {
+ err = PTR_ERR(tp);
+ goto errout;
+ }
+ tp_created = 1;
+ } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ fh = tp->ops->get(tp, t->tcm_handle);
+
+ if (!fh) {
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+ NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
+ err = -ENOENT;
+ goto errout;
+ }
+ } else if (n->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_ERR_MSG(extack, "Filter already exists");
+ err = -EEXIST;
+ goto errout;
+ }
+
+ if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) {
+ NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
+ n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
+ extack);
+ if (err == 0) {
+ if (tp_created)
+ tcf_chain_tp_insert(chain, &chain_info, tp);
+ tfilter_notify(net, skb, n, tp, block, q, parent, fh,
+ RTM_NEWTFILTER, false);
+ /* q pointer is NULL for shared blocks */
+ if (q)
+ q->flags &= ~TCQ_F_CAN_BYPASS;
+ } else {
+ if (tp_created)
+ tcf_proto_destroy(tp, NULL);
+ }
+
+errout:
+ if (chain)
+ tcf_chain_put(chain);
+ tcf_block_release(q, block);
+ if (err == -EAGAIN)
+ /* Replay the request. */
+ goto replay;
+ return err;
+}
+
+static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 protocol;
+ u32 prio;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain_info chain_info;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ struct tcf_proto *tp = NULL;
+ unsigned long cl = 0;
+ void *fh = NULL;
+ int err;
+
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ protocol = TC_H_MIN(t->tcm_info);
+ prio = TC_H_MAJ(t->tcm_info);
+ parent = t->tcm_parent;
+
+ if (prio == 0 && (protocol || t->tcm_handle || tca[TCA_KIND])) {
+ NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set");
+ return -ENOENT;
+ }
+
+ /* Find head of filter chain. */
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout;
+ }
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ err = -EINVAL;
+ goto errout;
+ }
+ chain = tcf_chain_get(block, chain_index, false);
+ if (!chain) {
+ /* User requested flush on non-existent chain. Nothing to do,
+ * so just return success.
+ */
+ if (prio == 0) {
+ err = 0;
+ goto errout;
+ }
+ NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ err = -ENOENT;
+ goto errout;
+ }
+
+ if (prio == 0) {
+ tfilter_notify_chain(net, skb, block, q, parent, n,
+ chain, RTM_DELTFILTER);
+ tcf_chain_flush(chain);
+ err = 0;
+ goto errout;
+ }
+
+ tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+ prio, false);
+ if (!tp || IS_ERR(tp)) {
+ NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+ err = tp ? PTR_ERR(tp) : -ENOENT;
+ goto errout;
+ } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ fh = tp->ops->get(tp, t->tcm_handle);
+
+ if (!fh) {
+ if (t->tcm_handle == 0) {
+ tcf_chain_tp_remove(chain, &chain_info, tp);
+ tfilter_notify(net, skb, n, tp, block, q, parent, fh,
+ RTM_DELTFILTER, false);
+ tcf_proto_destroy(tp, extack);
+ err = 0;
+ } else {
+ NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+ err = -ENOENT;
+ }
+ } else {
+ bool last;
+
+ err = tfilter_del_notify(net, skb, n, tp, block,
+ q, parent, fh, false, &last,
+ extack);
+ if (err)
+ goto errout;
+ if (last) {
+ tcf_chain_tp_remove(chain, &chain_info, tp);
+ tcf_proto_destroy(tp, extack);
+ }
+ }
+
+errout:
+ if (chain)
+ tcf_chain_put(chain);
+ tcf_block_release(q, block);
+ return err;
+}
+
+static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 protocol;
+ u32 prio;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain_info chain_info;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ struct tcf_proto *tp = NULL;
+ unsigned long cl = 0;
+ void *fh = NULL;
+ int err;
+
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ protocol = TC_H_MIN(t->tcm_info);
+ prio = TC_H_MAJ(t->tcm_info);
+ parent = t->tcm_parent;
+
+ if (prio == 0) {
+ NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
+ return -ENOENT;
+ }
+
+ /* Find head of filter chain. */
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout;
+ }
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ err = -EINVAL;
+ goto errout;
+ }
+ chain = tcf_chain_get(block, chain_index, false);
+ if (!chain) {
+ NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+ prio, false);
+ if (!tp || IS_ERR(tp)) {
+ NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+ err = tp ? PTR_ERR(tp) : -ENOENT;
+ goto errout;
+ } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ fh = tp->ops->get(tp, t->tcm_handle);
+
+ if (!fh) {
+ NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+ err = -ENOENT;
+ } else {
+ err = tfilter_notify(net, skb, n, tp, block, q, parent,
+ fh, RTM_NEWTFILTER, true);
+ if (err < 0)
+ NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
+ }
+
+errout:
+ if (chain)
+ tcf_chain_put(chain);
+ tcf_block_release(q, block);
+ return err;
+}
+
+struct tcf_dump_args {
+ struct tcf_walker w;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ struct tcf_block *block;
+ struct Qdisc *q;
+ u32 parent;
+};
+
+static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
+{
+ struct tcf_dump_args *a = (void *)arg;
+ struct net *net = sock_net(a->skb->sk);
+
+ return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
+ n, NETLINK_CB(a->cb->skb).portid,
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTFILTER);
+}
+
+static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
+ struct sk_buff *skb, struct netlink_callback *cb,
+ long index_start, long *p_index)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcf_block *block = chain->block;
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct tcf_dump_args arg;
+ struct tcf_proto *tp;
+
+ for (tp = rtnl_dereference(chain->filter_chain);
+ tp; tp = rtnl_dereference(tp->next), (*p_index)++) {
+ if (*p_index < index_start)
+ continue;
+ if (TC_H_MAJ(tcm->tcm_info) &&
+ TC_H_MAJ(tcm->tcm_info) != tp->prio)
+ continue;
+ if (TC_H_MIN(tcm->tcm_info) &&
+ TC_H_MIN(tcm->tcm_info) != tp->protocol)
+ continue;
+ if (*p_index > index_start)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (cb->args[1] == 0) {
+ if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTFILTER) <= 0)
+ return false;
+
+ cb->args[1] = 1;
+ }
+ if (!tp->ops->walk)
+ continue;
+ arg.w.fn = tcf_node_dump;
+ arg.skb = skb;
+ arg.cb = cb;
+ arg.block = block;
+ arg.q = q;
+ arg.parent = parent;
+ arg.w.stop = 0;
+ arg.w.skip = cb->args[1] - 1;
+ arg.w.count = 0;
+ arg.w.cookie = cb->args[2];
+ tp->ops->walk(tp, &arg.w);
+ cb->args[2] = arg.w.cookie;
+ cb->args[1] = arg.w.count + 1;
+ if (arg.w.stop)
+ return false;
+ }
+ return true;
+}
+
+/* called with RTNL */
+static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct Qdisc *q = NULL;
+ struct tcf_block *block;
+ struct tcf_chain *chain;
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ long index_start;
+ long index;
+ u32 parent;
+ int err;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return skb->len;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL);
+ if (err)
+ return err;
+
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_lookup(net, tcm->tcm_block_index);
+ if (!block)
+ goto out;
+ /* If we work with block index, q is NULL and parent value
+ * will never be used in the following code. The check
+ * in tcf_fill_node prevents it. However, compiler does not
+ * see that far, so set parent to zero to silence the warning
+ * about parent being uninitialized.
+ */
+ parent = 0;
+ } else {
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
+ unsigned long cl = 0;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return skb->len;
+
+ parent = tcm->tcm_parent;
+ if (!parent) {
+ q = dev->qdisc;
+ parent = q->handle;
+ } else {
+ q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+ }
+ if (!q)
+ goto out;
+ cops = q->ops->cl_ops;
+ if (!cops)
+ goto out;
+ if (!cops->tcf_block)
+ goto out;
+ if (TC_H_MIN(tcm->tcm_parent)) {
+ cl = cops->find(q, tcm->tcm_parent);
+ if (cl == 0)
+ goto out;
+ }
+ block = cops->tcf_block(q, cl, NULL);
+ if (!block)
+ goto out;
+ if (tcf_block_shared(block))
+ q = NULL;
+ }
+
+ index_start = cb->args[0];
+ index = 0;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ if (tca[TCA_CHAIN] &&
+ nla_get_u32(tca[TCA_CHAIN]) != chain->index)
+ continue;
+ if (!tcf_chain_dump(chain, q, parent, skb, cb,
+ index_start, &index)) {
+ err = -EMSGSIZE;
+ break;
+ }
+ }
+
+ cb->args[0] = index;
+
+out:
+ /* If we did no progress, the error (EMSGSIZE) is real */
+ if (skb->len == 0 && err)
+ return err;
+ return skb->len;
+}
+
+static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
+ struct sk_buff *skb, struct tcf_block *block,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ const struct tcf_proto_ops *ops;
+ struct nlmsghdr *nlh;
+ struct tcmsg *tcm;
+ void *priv;
+
+ ops = chain->tmplt_ops;
+ priv = chain->tmplt_priv;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_handle = 0;
+ if (block->q) {
+ tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex;
+ tcm->tcm_parent = block->q->handle;
+ } else {
+ tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
+ tcm->tcm_block_index = block->index;
+ }
+
+ if (nla_put_u32(skb, TCA_CHAIN, chain->index))
+ goto nla_put_failure;
+
+ if (ops) {
+ if (nla_put_string(skb, TCA_KIND, ops->kind))
+ goto nla_put_failure;
+ if (ops->tmplt_dump(skb, net, priv) < 0)
+ goto nla_put_failure;
+ }
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+ u32 seq, u16 flags, int event, bool unicast)
+{
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ struct tcf_block *block = chain->block;
+ struct net *net = block->net;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_chain_fill_node(chain, net, skb, block, portid,
+ seq, flags, event) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO);
+}
+
+static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ const struct tcf_proto_ops *ops;
+ void *tmplt_priv;
+
+ /* If kind is not set, user did not specify template. */
+ if (!tca[TCA_KIND])
+ return 0;
+
+ ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
+ NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
+ return -EOPNOTSUPP;
+ }
+
+ tmplt_priv = ops->tmplt_create(net, chain, tca, extack);
+ if (IS_ERR(tmplt_priv)) {
+ module_put(ops->owner);
+ return PTR_ERR(tmplt_priv);
+ }
+ chain->tmplt_ops = ops;
+ chain->tmplt_priv = tmplt_priv;
+ return 0;
+}
+
+static void tc_chain_tmplt_del(struct tcf_chain *chain)
+{
+ const struct tcf_proto_ops *ops = chain->tmplt_ops;
+
+ /* If template ops are set, no work to do for us. */
+ if (!ops)
+ return;
+
+ ops->tmplt_destroy(chain->tmplt_priv);
+ module_put(ops->owner);
+}
+
+/* Add/delete/get a chain */
+
+static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ unsigned long cl;
+ int err;
+
+ if (n->nlmsg_type != RTM_GETCHAIN &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+replay:
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ parent = t->tcm_parent;
+ cl = 0;
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block))
+ return PTR_ERR(block);
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ err = -EINVAL;
+ goto errout_block;
+ }
+ chain = tcf_chain_lookup(block, chain_index);
+ if (n->nlmsg_type == RTM_NEWCHAIN) {
+ if (chain) {
+ if (tcf_chain_held_by_acts_only(chain)) {
+ /* The chain exists only because there is
+ * some action referencing it.
+ */
+ tcf_chain_hold(chain);
+ } else {
+ NL_SET_ERR_MSG(extack, "Filter chain already exists");
+ err = -EEXIST;
+ goto errout_block;
+ }
+ } else {
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+ NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
+ err = -ENOENT;
+ goto errout_block;
+ }
+ chain = tcf_chain_create(block, chain_index);
+ if (!chain) {
+ NL_SET_ERR_MSG(extack, "Failed to create filter chain");
+ err = -ENOMEM;
+ goto errout_block;
+ }
+ }
+ } else {
+ if (!chain || tcf_chain_held_by_acts_only(chain)) {
+ NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ err = -EINVAL;
+ goto errout_block;
+ }
+ tcf_chain_hold(chain);
+ }
+
+ switch (n->nlmsg_type) {
+ case RTM_NEWCHAIN:
+ err = tc_chain_tmplt_add(chain, net, tca, extack);
+ if (err)
+ goto errout;
+ /* In case the chain was successfully added, take a reference
+ * to the chain. This ensures that an empty chain
+ * does not disappear at the end of this function.
+ */
+ tcf_chain_hold(chain);
+ chain->explicitly_created = true;
+ tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+ RTM_NEWCHAIN, false);
+ break;
+ case RTM_DELCHAIN:
+ tfilter_notify_chain(net, skb, block, q, parent, n,
+ chain, RTM_DELTFILTER);
+ /* Flush the chain first as the user requested chain removal. */
+ tcf_chain_flush(chain);
+ /* In case the chain was successfully deleted, put a reference
+ * to the chain previously taken during addition.
+ */
+ tcf_chain_put_explicitly_created(chain);
+ chain->explicitly_created = false;
+ break;
+ case RTM_GETCHAIN:
+ err = tc_chain_notify(chain, skb, n->nlmsg_seq,
+ n->nlmsg_flags, n->nlmsg_type, true);
+ if (err < 0)
+ NL_SET_ERR_MSG(extack, "Failed to send chain notify message");
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(extack, "Unsupported message type");
+ goto errout;
+ }
+
+errout:
+ tcf_chain_put(chain);
+errout_block:
+ tcf_block_release(q, block);
+ if (err == -EAGAIN)
+ /* Replay the request. */
+ goto replay;
+ return err;
+}
+
+/* called with RTNL */
+static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct Qdisc *q = NULL;
+ struct tcf_block *block;
+ struct tcf_chain *chain;
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ long index_start;
+ long index;
+ u32 parent;
+ int err;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return skb->len;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ NULL);
+ if (err)
+ return err;
+
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_lookup(net, tcm->tcm_block_index);
+ if (!block)
+ goto out;
+ /* If we work with block index, q is NULL and parent value
+ * will never be used in the following code. The check
+ * in tcf_fill_node prevents it. However, compiler does not
+ * see that far, so set parent to zero to silence the warning
+ * about parent being uninitialized.
+ */
+ parent = 0;
+ } else {
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
+ unsigned long cl = 0;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return skb->len;
+
+ parent = tcm->tcm_parent;
+ if (!parent) {
+ q = dev->qdisc;
+ parent = q->handle;
+ } else {
+ q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+ }
+ if (!q)
+ goto out;
+ cops = q->ops->cl_ops;
+ if (!cops)
+ goto out;
+ if (!cops->tcf_block)
+ goto out;
+ if (TC_H_MIN(tcm->tcm_parent)) {
+ cl = cops->find(q, tcm->tcm_parent);
+ if (cl == 0)
+ goto out;
+ }
+ block = cops->tcf_block(q, cl, NULL);
+ if (!block)
+ goto out;
+ if (tcf_block_shared(block))
+ q = NULL;
+ }
+
+ index_start = cb->args[0];
+ index = 0;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ if ((tca[TCA_CHAIN] &&
+ nla_get_u32(tca[TCA_CHAIN]) != chain->index))
+ continue;
+ if (index < index_start) {
+ index++;
+ continue;
+ }
+ if (tcf_chain_held_by_acts_only(chain))
+ continue;
+ err = tc_chain_fill_node(chain, net, skb, block,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWCHAIN);
+ if (err <= 0)
+ break;
+ index++;
+ }
+
+ cb->args[0] = index;
+
+out:
+ /* If we did no progress, the error (EMSGSIZE) is real */
+ if (skb->len == 0 && err)
+ return err;
+ return skb->len;
+}
+
+void tcf_exts_destroy(struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ if (exts->actions) {
+ tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
+ kfree(exts->actions);
+ }
+ exts->nr_actions = 0;
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_destroy);
+
+int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
+ struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ {
+ struct tc_action *act;
+ size_t attr_size = 0;
+
+ if (exts->police && tb[exts->police]) {
+ act = tcf_action_init_1(net, tp, tb[exts->police],
+ rate_tlv, "police", ovr,
+ TCA_ACT_BIND, true, extack);
+ if (IS_ERR(act))
+ return PTR_ERR(act);
+
+ act->type = exts->type = TCA_OLD_COMPAT;
+ exts->actions[0] = act;
+ exts->nr_actions = 1;
+ } else if (exts->action && tb[exts->action]) {
+ int err;
+
+ err = tcf_action_init(net, tp, tb[exts->action],
+ rate_tlv, NULL, ovr, TCA_ACT_BIND,
+ exts->actions, &attr_size, true,
+ extack);
+ if (err < 0)
+ return err;
+ exts->nr_actions = err;
+ }
+ exts->net = net;
+ }
+#else
+ if ((exts->action && tb[exts->action]) ||
+ (exts->police && tb[exts->police])) {
+ NL_SET_ERR_MSG(extack, "Classifier actions are not supported per compile options (CONFIG_NET_CLS_ACT)");
+ return -EOPNOTSUPP;
+ }
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL(tcf_exts_validate);
+
+void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct tcf_exts old = *dst;
+
+ *dst = *src;
+ tcf_exts_destroy(&old);
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_change);
+
+#ifdef CONFIG_NET_CLS_ACT
+static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts)
+{
+ if (exts->nr_actions == 0)
+ return NULL;
+ else
+ return exts->actions[0];
+}
+#endif
+
+int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct nlattr *nest;
+
+ if (exts->action && tcf_exts_has_actions(exts)) {
+ /*
+ * again for backward compatible mode - we want
+ * to work with both old and new modes of entering
+ * tc data even if iproute2 was newer - jhs
+ */
+ if (exts->type != TCA_OLD_COMPAT) {
+ nest = nla_nest_start(skb, exts->action);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (tcf_action_dump(skb, exts->actions, 0, 0) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ } else if (exts->police) {
+ struct tc_action *act = tcf_exts_first_act(exts);
+ nest = nla_nest_start(skb, exts->police);
+ if (nest == NULL || !act)
+ goto nla_put_failure;
+ if (tcf_action_dump_old(skb, act, 0, 0) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ }
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+#else
+ return 0;
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_dump);
+
+
+int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct tc_action *a = tcf_exts_first_act(exts);
+ if (a != NULL && tcf_action_copy_stats(skb, a, 1) < 0)
+ return -1;
+#endif
+ return 0;
+}
+EXPORT_SYMBOL(tcf_exts_dump_stats);
+
+static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
+ enum tc_setup_type type,
+ void *type_data, bool err_stop)
+{
+ int ok_count = 0;
+#ifdef CONFIG_NET_CLS_ACT
+ const struct tc_action *a;
+ struct net_device *dev;
+ int i, ret;
+
+ if (!tcf_exts_has_actions(exts))
+ return 0;
+
+ for (i = 0; i < exts->nr_actions; i++) {
+ a = exts->actions[i];
+ if (!a->ops->get_dev)
+ continue;
+ dev = a->ops->get_dev(a);
+ if (!dev)
+ continue;
+ ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop);
+ a->ops->put_dev(dev);
+ if (ret < 0)
+ return ret;
+ ok_count += ret;
+ }
+#endif
+ return ok_count;
+}
+
+int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
+ enum tc_setup_type type, void *type_data, bool err_stop)
+{
+ int ok_count;
+ int ret;
+
+ ret = tcf_block_cb_call(block, type, type_data, err_stop);
+ if (ret < 0)
+ return ret;
+ ok_count = ret;
+
+ if (!exts || ok_count)
+ return ok_count;
+ ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop);
+ if (ret < 0)
+ return ret;
+ ok_count += ret;
+
+ return ok_count;
+}
+EXPORT_SYMBOL(tc_setup_cb_call);
+
+static __net_init int tcf_net_init(struct net *net)
+{
+ struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+ idr_init(&tn->idr);
+ return 0;
+}
+
+static void __net_exit tcf_net_exit(struct net *net)
+{
+ struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+ idr_destroy(&tn->idr);
+}
+
+static struct pernet_operations tcf_net_ops = {
+ .init = tcf_net_init,
+ .exit = tcf_net_exit,
+ .id = &tcf_net_id,
+ .size = sizeof(struct tcf_net),
+};
+
+static int __init tc_filter_init(void)
+{
+ int err;
+
+ tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
+ if (!tc_filter_wq)
+ return -ENOMEM;
+
+ err = register_pernet_subsys(&tcf_net_ops);
+ if (err)
+ goto err_register_pernet_subsys;
+
+ rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
+ tc_dump_tfilter, 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
+ tc_dump_chain, 0);
+
+ return 0;
+
+err_register_pernet_subsys:
+ destroy_workqueue(tc_filter_wq);
+ return err;
+}
+
+subsys_initcall(tc_filter_init);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
new file mode 100644
index 000000000..14098da69
--- /dev/null
+++ b/net/sched/cls_basic.c
@@ -0,0 +1,331 @@
+/*
+ * net/sched/cls_basic.c Basic Packet Classifier.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/idr.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+struct basic_head {
+ struct list_head flist;
+ struct idr handle_idr;
+ struct rcu_head rcu;
+};
+
+struct basic_filter {
+ u32 handle;
+ struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
+ struct tcf_result res;
+ struct tcf_proto *tp;
+ struct list_head link;
+ struct rcu_work rwork;
+};
+
+static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ int r;
+ struct basic_head *head = rcu_dereference_bh(tp->root);
+ struct basic_filter *f;
+
+ list_for_each_entry_rcu(f, &head->flist, link) {
+ if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+ continue;
+ *res = f->res;
+ r = tcf_exts_exec(skb, &f->exts, res);
+ if (r < 0)
+ continue;
+ return r;
+ }
+ return -1;
+}
+
+static void *basic_get(struct tcf_proto *tp, u32 handle)
+{
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct basic_filter *f;
+
+ list_for_each_entry(f, &head->flist, link) {
+ if (f->handle == handle) {
+ return f;
+ }
+ }
+
+ return NULL;
+}
+
+static int basic_init(struct tcf_proto *tp)
+{
+ struct basic_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+ INIT_LIST_HEAD(&head->flist);
+ idr_init(&head->handle_idr);
+ rcu_assign_pointer(tp->root, head);
+ return 0;
+}
+
+static void __basic_delete_filter(struct basic_filter *f)
+{
+ tcf_exts_destroy(&f->exts);
+ tcf_em_tree_destroy(&f->ematches);
+ tcf_exts_put_net(&f->exts);
+ kfree(f);
+}
+
+static void basic_delete_filter_work(struct work_struct *work)
+{
+ struct basic_filter *f = container_of(to_rcu_work(work),
+ struct basic_filter,
+ rwork);
+ rtnl_lock();
+ __basic_delete_filter(f);
+ rtnl_unlock();
+}
+
+static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct basic_filter *f, *n;
+
+ list_for_each_entry_safe(f, n, &head->flist, link) {
+ list_del_rcu(&f->link);
+ tcf_unbind_filter(tp, &f->res);
+ idr_remove(&head->handle_idr, f->handle);
+ if (tcf_exts_get_net(&f->exts))
+ tcf_queue_work(&f->rwork, basic_delete_filter_work);
+ else
+ __basic_delete_filter(f);
+ }
+ idr_destroy(&head->handle_idr);
+ kfree_rcu(head, rcu);
+}
+
+static int basic_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct basic_filter *f = arg;
+
+ list_del_rcu(&f->link);
+ tcf_unbind_filter(tp, &f->res);
+ idr_remove(&head->handle_idr, f->handle);
+ tcf_exts_get_net(&f->exts);
+ tcf_queue_work(&f->rwork, basic_delete_filter_work);
+ *last = list_empty(&head->flist);
+ return 0;
+}
+
+static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
+ [TCA_BASIC_CLASSID] = { .type = NLA_U32 },
+ [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
+};
+
+static int basic_set_parms(struct net *net, struct tcf_proto *tp,
+ struct basic_filter *f, unsigned long base,
+ struct nlattr **tb,
+ struct nlattr *est, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ if (err < 0)
+ return err;
+
+ err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &f->ematches);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_BASIC_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ f->tp = tp;
+ return 0;
+}
+
+static int basic_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, void **arg, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct nlattr *tb[TCA_BASIC_MAX + 1];
+ struct basic_filter *fold = (struct basic_filter *) *arg;
+ struct basic_filter *fnew;
+
+ if (tca[TCA_OPTIONS] == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
+ basic_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (fold != NULL) {
+ if (handle && fold->handle != handle)
+ return -EINVAL;
+ }
+
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (!fnew)
+ return -ENOBUFS;
+
+ err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ if (err < 0)
+ goto errout;
+
+ if (!handle) {
+ handle = 1;
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ INT_MAX, GFP_KERNEL);
+ } else if (!fold) {
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ handle, GFP_KERNEL);
+ }
+ if (err)
+ goto errout;
+ fnew->handle = handle;
+
+ err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr,
+ extack);
+ if (err < 0) {
+ if (!fold)
+ idr_remove(&head->handle_idr, fnew->handle);
+ goto errout;
+ }
+
+ *arg = fnew;
+
+ if (fold) {
+ idr_replace(&head->handle_idr, fnew, fnew->handle);
+ list_replace_rcu(&fold->link, &fnew->link);
+ tcf_unbind_filter(tp, &fold->res);
+ tcf_exts_get_net(&fold->exts);
+ tcf_queue_work(&fold->rwork, basic_delete_filter_work);
+ } else {
+ list_add_rcu(&fnew->link, &head->flist);
+ }
+
+ return 0;
+errout:
+ tcf_exts_destroy(&fnew->exts);
+ kfree(fnew);
+ return err;
+}
+
+static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct basic_filter *f;
+
+ list_for_each_entry(f, &head->flist, link) {
+ if (arg->count < arg->skip)
+ goto skip;
+
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
+{
+ struct basic_filter *f = fh;
+
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
+}
+
+static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct basic_filter *f = fh;
+ struct nlattr *nest;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0 ||
+ tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_basic_ops __read_mostly = {
+ .kind = "basic",
+ .classify = basic_classify,
+ .init = basic_init,
+ .destroy = basic_destroy,
+ .get = basic_get,
+ .change = basic_change,
+ .delete = basic_delete,
+ .walk = basic_walk,
+ .dump = basic_dump,
+ .bind_class = basic_bind_class,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_basic(void)
+{
+ return register_tcf_proto_ops(&cls_basic_ops);
+}
+
+static void __exit exit_basic(void)
+{
+ unregister_tcf_proto_ops(&cls_basic_ops);
+}
+
+module_init(init_basic)
+module_exit(exit_basic)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
new file mode 100644
index 000000000..5d100126c
--- /dev/null
+++ b/net/sched/cls_bpf.c
@@ -0,0 +1,722 @@
+/*
+ * Berkeley Packet Filter based traffic classifier
+ *
+ * Might be used to classify traffic through flexible, user-defined and
+ * possibly JIT-ed BPF filters for traffic control as an alternative to
+ * ematches.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/idr.h>
+
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("TC BPF based classifier");
+
+#define CLS_BPF_NAME_LEN 256
+#define CLS_BPF_SUPPORTED_GEN_FLAGS \
+ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
+
+struct cls_bpf_head {
+ struct list_head plist;
+ struct idr handle_idr;
+ struct rcu_head rcu;
+};
+
+struct cls_bpf_prog {
+ struct bpf_prog *filter;
+ struct list_head link;
+ struct tcf_result res;
+ bool exts_integrated;
+ u32 gen_flags;
+ unsigned int in_hw_count;
+ struct tcf_exts exts;
+ u32 handle;
+ u16 bpf_num_ops;
+ struct sock_filter *bpf_ops;
+ const char *bpf_name;
+ struct tcf_proto *tp;
+ struct rcu_work rwork;
+};
+
+static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
+ [TCA_BPF_CLASSID] = { .type = NLA_U32 },
+ [TCA_BPF_FLAGS] = { .type = NLA_U32 },
+ [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 },
+ [TCA_BPF_FD] = { .type = NLA_U32 },
+ [TCA_BPF_NAME] = { .type = NLA_NUL_STRING,
+ .len = CLS_BPF_NAME_LEN },
+ [TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
+ [TCA_BPF_OPS] = { .type = NLA_BINARY,
+ .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static int cls_bpf_exec_opcode(int code)
+{
+ switch (code) {
+ case TC_ACT_OK:
+ case TC_ACT_SHOT:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ case TC_ACT_REDIRECT:
+ case TC_ACT_UNSPEC:
+ return code;
+ default:
+ return TC_ACT_UNSPEC;
+ }
+}
+
+static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
+ bool at_ingress = skb_at_tc_ingress(skb);
+ struct cls_bpf_prog *prog;
+ int ret = -1;
+
+ /* Needed here for accessing maps. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(prog, &head->plist, link) {
+ int filter_res;
+
+ qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
+
+ if (tc_skip_sw(prog->gen_flags)) {
+ filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
+ } else if (at_ingress) {
+ /* It is safe to push/pull even if skb_shared() */
+ __skb_push(skb, skb->mac_len);
+ bpf_compute_data_pointers(skb);
+ filter_res = BPF_PROG_RUN(prog->filter, skb);
+ __skb_pull(skb, skb->mac_len);
+ } else {
+ bpf_compute_data_pointers(skb);
+ filter_res = BPF_PROG_RUN(prog->filter, skb);
+ }
+
+ if (prog->exts_integrated) {
+ res->class = 0;
+ res->classid = TC_H_MAJ(prog->res.classid) |
+ qdisc_skb_cb(skb)->tc_classid;
+
+ ret = cls_bpf_exec_opcode(filter_res);
+ if (ret == TC_ACT_UNSPEC)
+ continue;
+ break;
+ }
+
+ if (filter_res == 0)
+ continue;
+ if (filter_res != -1) {
+ res->class = 0;
+ res->classid = filter_res;
+ } else {
+ *res = prog->res;
+ }
+
+ ret = tcf_exts_exec(skb, &prog->exts, res);
+ if (ret < 0)
+ continue;
+
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
+{
+ return !prog->bpf_ops;
+}
+
+static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ struct cls_bpf_prog *oldprog,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_bpf_offload cls_bpf = {};
+ struct cls_bpf_prog *obj;
+ bool skip_sw;
+ int err;
+
+ skip_sw = prog && tc_skip_sw(prog->gen_flags);
+ obj = prog ?: oldprog;
+
+ tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags,
+ extack);
+ cls_bpf.command = TC_CLSBPF_OFFLOAD;
+ cls_bpf.exts = &obj->exts;
+ cls_bpf.prog = prog ? prog->filter : NULL;
+ cls_bpf.oldprog = oldprog ? oldprog->filter : NULL;
+ cls_bpf.name = obj->bpf_name;
+ cls_bpf.exts_integrated = obj->exts_integrated;
+
+ if (oldprog)
+ tcf_block_offload_dec(block, &oldprog->gen_flags);
+
+ err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
+ if (prog) {
+ if (err < 0) {
+ cls_bpf_offload_cmd(tp, oldprog, prog, extack);
+ return err;
+ } else if (err > 0) {
+ prog->in_hw_count = err;
+ tcf_block_offload_inc(block, &prog->gen_flags);
+ }
+ }
+
+ if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static u32 cls_bpf_flags(u32 flags)
+{
+ return flags & CLS_BPF_SUPPORTED_GEN_FLAGS;
+}
+
+static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ struct cls_bpf_prog *oldprog,
+ struct netlink_ext_ack *extack)
+{
+ if (prog && oldprog &&
+ cls_bpf_flags(prog->gen_flags) !=
+ cls_bpf_flags(oldprog->gen_flags))
+ return -EINVAL;
+
+ if (prog && tc_skip_hw(prog->gen_flags))
+ prog = NULL;
+ if (oldprog && tc_skip_hw(oldprog->gen_flags))
+ oldprog = NULL;
+ if (!prog && !oldprog)
+ return 0;
+
+ return cls_bpf_offload_cmd(tp, prog, oldprog, extack);
+}
+
+static void cls_bpf_stop_offload(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = cls_bpf_offload_cmd(tp, NULL, prog, extack);
+ if (err)
+ pr_err("Stopping hardware offload failed: %d\n", err);
+}
+
+static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog)
+{
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_bpf_offload cls_bpf = {};
+
+ tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, NULL);
+ cls_bpf.command = TC_CLSBPF_STATS;
+ cls_bpf.exts = &prog->exts;
+ cls_bpf.prog = prog->filter;
+ cls_bpf.name = prog->bpf_name;
+ cls_bpf.exts_integrated = prog->exts_integrated;
+
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false);
+}
+
+static int cls_bpf_init(struct tcf_proto *tp)
+{
+ struct cls_bpf_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+
+ INIT_LIST_HEAD_RCU(&head->plist);
+ idr_init(&head->handle_idr);
+ rcu_assign_pointer(tp->root, head);
+
+ return 0;
+}
+
+static void cls_bpf_free_parms(struct cls_bpf_prog *prog)
+{
+ if (cls_bpf_is_ebpf(prog))
+ bpf_prog_put(prog->filter);
+ else
+ bpf_prog_destroy(prog->filter);
+
+ kfree(prog->bpf_name);
+ kfree(prog->bpf_ops);
+}
+
+static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
+{
+ tcf_exts_destroy(&prog->exts);
+ tcf_exts_put_net(&prog->exts);
+
+ cls_bpf_free_parms(prog);
+ kfree(prog);
+}
+
+static void cls_bpf_delete_prog_work(struct work_struct *work)
+{
+ struct cls_bpf_prog *prog = container_of(to_rcu_work(work),
+ struct cls_bpf_prog,
+ rwork);
+ rtnl_lock();
+ __cls_bpf_delete_prog(prog);
+ rtnl_unlock();
+}
+
+static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+
+ idr_remove(&head->handle_idr, prog->handle);
+ cls_bpf_stop_offload(tp, prog, extack);
+ list_del_rcu(&prog->link);
+ tcf_unbind_filter(tp, &prog->res);
+ if (tcf_exts_get_net(&prog->exts))
+ tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work);
+ else
+ __cls_bpf_delete_prog(prog);
+}
+
+static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+
+ __cls_bpf_delete(tp, arg, extack);
+ *last = list_empty(&head->plist);
+ return 0;
+}
+
+static void cls_bpf_destroy(struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *prog, *tmp;
+
+ list_for_each_entry_safe(prog, tmp, &head->plist, link)
+ __cls_bpf_delete(tp, prog, extack);
+
+ idr_destroy(&head->handle_idr);
+ kfree_rcu(head, rcu);
+}
+
+static void *cls_bpf_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *prog;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (prog->handle == handle)
+ return prog;
+ }
+
+ return NULL;
+}
+
+static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
+{
+ struct sock_filter *bpf_ops;
+ struct sock_fprog_kern fprog_tmp;
+ struct bpf_prog *fp;
+ u16 bpf_size, bpf_num_ops;
+ int ret;
+
+ bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
+ if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
+ return -EINVAL;
+
+ bpf_size = bpf_num_ops * sizeof(*bpf_ops);
+ if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
+ return -EINVAL;
+
+ bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL);
+ if (bpf_ops == NULL)
+ return -ENOMEM;
+
+ fprog_tmp.len = bpf_num_ops;
+ fprog_tmp.filter = bpf_ops;
+
+ ret = bpf_prog_create(&fp, &fprog_tmp);
+ if (ret < 0) {
+ kfree(bpf_ops);
+ return ret;
+ }
+
+ prog->bpf_ops = bpf_ops;
+ prog->bpf_num_ops = bpf_num_ops;
+ prog->bpf_name = NULL;
+ prog->filter = fp;
+
+ return 0;
+}
+
+static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
+ u32 gen_flags, const struct tcf_proto *tp)
+{
+ struct bpf_prog *fp;
+ char *name = NULL;
+ bool skip_sw;
+ u32 bpf_fd;
+
+ bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
+ skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW;
+
+ fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ if (tb[TCA_BPF_NAME]) {
+ name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
+ if (!name) {
+ bpf_prog_put(fp);
+ return -ENOMEM;
+ }
+ }
+
+ prog->bpf_ops = NULL;
+ prog->bpf_name = name;
+ prog->filter = fp;
+
+ if (fp->dst_needed)
+ tcf_block_netif_keep_dst(tp->chain->block);
+
+ return 0;
+}
+
+static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
+ struct cls_bpf_prog *prog, unsigned long base,
+ struct nlattr **tb, struct nlattr *est, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ bool is_bpf, is_ebpf, have_exts = false;
+ u32 gen_flags = 0;
+ int ret;
+
+ is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
+ is_ebpf = tb[TCA_BPF_FD];
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
+ return -EINVAL;
+
+ ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[TCA_BPF_FLAGS]) {
+ u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
+
+ if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT)
+ return -EINVAL;
+
+ have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
+ }
+ if (tb[TCA_BPF_FLAGS_GEN]) {
+ gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
+ if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
+ !tc_flags_valid(gen_flags))
+ return -EINVAL;
+ }
+
+ prog->exts_integrated = have_exts;
+ prog->gen_flags = gen_flags;
+
+ ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
+ cls_bpf_prog_from_efd(tb, prog, gen_flags, tp);
+ if (ret < 0)
+ return ret;
+
+ if (tb[TCA_BPF_CLASSID]) {
+ prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+ tcf_bind_filter(tp, &prog->res, base);
+ }
+
+ return 0;
+}
+
+static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ void **arg, bool ovr, struct netlink_ext_ack *extack)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *oldprog = *arg;
+ struct nlattr *tb[TCA_BPF_MAX + 1];
+ struct cls_bpf_prog *prog;
+ int ret;
+
+ if (tca[TCA_OPTIONS] == NULL)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy,
+ NULL);
+ if (ret < 0)
+ return ret;
+
+ prog = kzalloc(sizeof(*prog), GFP_KERNEL);
+ if (!prog)
+ return -ENOBUFS;
+
+ ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ if (ret < 0)
+ goto errout;
+
+ if (oldprog) {
+ if (handle && oldprog->handle != handle) {
+ ret = -EINVAL;
+ goto errout;
+ }
+ }
+
+ if (handle == 0) {
+ handle = 1;
+ ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
+ INT_MAX, GFP_KERNEL);
+ } else if (!oldprog) {
+ ret = idr_alloc_u32(&head->handle_idr, prog, &handle,
+ handle, GFP_KERNEL);
+ }
+
+ if (ret)
+ goto errout;
+ prog->handle = handle;
+
+ ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr,
+ extack);
+ if (ret < 0)
+ goto errout_idr;
+
+ ret = cls_bpf_offload(tp, prog, oldprog, extack);
+ if (ret)
+ goto errout_parms;
+
+ if (!tc_in_hw(prog->gen_flags))
+ prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
+ if (oldprog) {
+ idr_replace(&head->handle_idr, prog, handle);
+ list_replace_rcu(&oldprog->link, &prog->link);
+ tcf_unbind_filter(tp, &oldprog->res);
+ tcf_exts_get_net(&oldprog->exts);
+ tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work);
+ } else {
+ list_add_rcu(&prog->link, &head->plist);
+ }
+
+ *arg = prog;
+ return 0;
+
+errout_parms:
+ cls_bpf_free_parms(prog);
+errout_idr:
+ if (!oldprog)
+ idr_remove(&head->handle_idr, prog->handle);
+errout:
+ tcf_exts_destroy(&prog->exts);
+ kfree(prog);
+ return ret;
+}
+
+static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
+ struct sk_buff *skb)
+{
+ struct nlattr *nla;
+
+ if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
+ return -EMSGSIZE;
+
+ nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops *
+ sizeof(struct sock_filter));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+ return 0;
+}
+
+static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
+ struct sk_buff *skb)
+{
+ struct nlattr *nla;
+
+ if (prog->bpf_name &&
+ nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_BPF_ID, prog->filter->aux->id))
+ return -EMSGSIZE;
+
+ nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
+
+ return 0;
+}
+
+static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *tm)
+{
+ struct cls_bpf_prog *prog = fh;
+ struct nlattr *nest;
+ u32 bpf_flags = 0;
+ int ret;
+
+ if (prog == NULL)
+ return skb->len;
+
+ tm->tcm_handle = prog->handle;
+
+ cls_bpf_offload_update_stats(tp, prog);
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (prog->res.classid &&
+ nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
+ goto nla_put_failure;
+
+ if (cls_bpf_is_ebpf(prog))
+ ret = cls_bpf_dump_ebpf_info(prog, skb);
+ else
+ ret = cls_bpf_dump_bpf_info(prog, skb);
+ if (ret)
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &prog->exts) < 0)
+ goto nla_put_failure;
+
+ if (prog->exts_integrated)
+ bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
+ if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
+ goto nla_put_failure;
+ if (prog->gen_flags &&
+ nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl,
+ void *q, unsigned long base)
+{
+ struct cls_bpf_prog *prog = fh;
+
+ if (prog && prog->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &prog->res, base);
+ else
+ __tcf_unbind_filter(q, &prog->res);
+ }
+}
+
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *prog;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, prog, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_bpf_offload cls_bpf = {};
+ struct cls_bpf_prog *prog;
+ int err;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (tc_skip_hw(prog->gen_flags))
+ continue;
+
+ tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags,
+ extack);
+ cls_bpf.command = TC_CLSBPF_OFFLOAD;
+ cls_bpf.exts = &prog->exts;
+ cls_bpf.prog = add ? prog->filter : NULL;
+ cls_bpf.oldprog = add ? NULL : prog->filter;
+ cls_bpf.name = prog->bpf_name;
+ cls_bpf.exts_integrated = prog->exts_integrated;
+
+ err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(prog->gen_flags))
+ return err;
+ continue;
+ }
+
+ tc_cls_offload_cnt_update(block, &prog->in_hw_count,
+ &prog->gen_flags, add);
+ }
+
+ return 0;
+}
+
+static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
+ .kind = "bpf",
+ .owner = THIS_MODULE,
+ .classify = cls_bpf_classify,
+ .init = cls_bpf_init,
+ .destroy = cls_bpf_destroy,
+ .get = cls_bpf_get,
+ .change = cls_bpf_change,
+ .delete = cls_bpf_delete,
+ .walk = cls_bpf_walk,
+ .reoffload = cls_bpf_reoffload,
+ .dump = cls_bpf_dump,
+ .bind_class = cls_bpf_bind_class,
+};
+
+static int __init cls_bpf_init_mod(void)
+{
+ return register_tcf_proto_ops(&cls_bpf_ops);
+}
+
+static void __exit cls_bpf_exit_mod(void)
+{
+ unregister_tcf_proto_ops(&cls_bpf_ops);
+}
+
+module_init(cls_bpf_init_mod);
+module_exit(cls_bpf_exit_mod);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
new file mode 100644
index 000000000..3bc01bdde
--- /dev/null
+++ b/net/sched/cls_cgroup.c
@@ -0,0 +1,221 @@
+/*
+ * net/sched/cls_cgroup.c Control Group Classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rcupdate.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/cls_cgroup.h>
+
+struct cls_cgroup_head {
+ u32 handle;
+ struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
+ struct tcf_proto *tp;
+ struct rcu_work rwork;
+};
+
+static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
+ u32 classid = task_get_classid(skb);
+
+ if (!classid)
+ return -1;
+ if (!tcf_em_tree_match(skb, &head->ematches, NULL))
+ return -1;
+
+ res->classid = classid;
+ res->class = 0;
+
+ return tcf_exts_exec(skb, &head->exts, res);
+}
+
+static void *cls_cgroup_get(struct tcf_proto *tp, u32 handle)
+{
+ return NULL;
+}
+
+static int cls_cgroup_init(struct tcf_proto *tp)
+{
+ return 0;
+}
+
+static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
+ [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED },
+};
+
+static void __cls_cgroup_destroy(struct cls_cgroup_head *head)
+{
+ tcf_exts_destroy(&head->exts);
+ tcf_em_tree_destroy(&head->ematches);
+ tcf_exts_put_net(&head->exts);
+ kfree(head);
+}
+
+static void cls_cgroup_destroy_work(struct work_struct *work)
+{
+ struct cls_cgroup_head *head = container_of(to_rcu_work(work),
+ struct cls_cgroup_head,
+ rwork);
+ rtnl_lock();
+ __cls_cgroup_destroy(head);
+ rtnl_unlock();
+}
+
+static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ void **arg, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_CGROUP_MAX + 1];
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+ struct cls_cgroup_head *new;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return -EINVAL;
+
+ if (!head && !handle)
+ return -EINVAL;
+
+ if (head && handle != head->handle)
+ return -ENOENT;
+
+ new = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!new)
+ return -ENOBUFS;
+
+ err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ if (err < 0)
+ goto errout;
+ new->handle = handle;
+ new->tp = tp;
+ err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
+ cgroup_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr,
+ extack);
+ if (err < 0)
+ goto errout;
+
+ err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &new->ematches);
+ if (err < 0)
+ goto errout;
+
+ rcu_assign_pointer(tp->root, new);
+ if (head) {
+ tcf_exts_get_net(&head->exts);
+ tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
+ }
+ return 0;
+errout:
+ tcf_exts_destroy(&new->exts);
+ kfree(new);
+ return err;
+}
+
+static void cls_cgroup_destroy(struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+
+ /* Head can still be NULL due to cls_cgroup_init(). */
+ if (head) {
+ if (tcf_exts_get_net(&head->exts))
+ tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
+ else
+ __cls_cgroup_destroy(head);
+ }
+}
+
+static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+
+ if (arg->count < arg->skip)
+ goto skip;
+
+ if (arg->fn(tp, head, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+skip:
+ arg->count++;
+}
+
+static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+ struct nlattr *nest;
+
+ t->tcm_handle = head->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &head->exts) < 0 ||
+ tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &head->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
+ .kind = "cgroup",
+ .init = cls_cgroup_init,
+ .change = cls_cgroup_change,
+ .classify = cls_cgroup_classify,
+ .destroy = cls_cgroup_destroy,
+ .get = cls_cgroup_get,
+ .delete = cls_cgroup_delete,
+ .walk = cls_cgroup_walk,
+ .dump = cls_cgroup_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_cgroup_cls(void)
+{
+ return register_tcf_proto_ops(&cls_cgroup_ops);
+}
+
+static void __exit exit_cgroup_cls(void)
+{
+ unregister_tcf_proto_ops(&cls_cgroup_ops);
+}
+
+module_init(init_cgroup_cls);
+module_exit(exit_cgroup_cls);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
new file mode 100644
index 000000000..55bf75cb1
--- /dev/null
+++ b/net/sched/cls_flow.c
@@ -0,0 +1,725 @@
+/*
+ * net/sched/cls_flow.c Generic flow classifier
+ *
+ * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/pkt_cls.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/inet_sock.h>
+
+#include <net/pkt_cls.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/flow_dissector.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+struct flow_head {
+ struct list_head filters;
+ struct rcu_head rcu;
+};
+
+struct flow_filter {
+ struct list_head list;
+ struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
+ struct tcf_proto *tp;
+ struct timer_list perturb_timer;
+ u32 perturb_period;
+ u32 handle;
+
+ u32 nkeys;
+ u32 keymask;
+ u32 mode;
+ u32 mask;
+ u32 xor;
+ u32 rshift;
+ u32 addend;
+ u32 divisor;
+ u32 baseclass;
+ u32 hashrnd;
+ struct rcu_work rwork;
+};
+
+static inline u32 addr_fold(void *addr)
+{
+ unsigned long a = (unsigned long)addr;
+
+ return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
+}
+
+static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ __be32 src = flow_get_u32_src(flow);
+
+ if (src)
+ return ntohl(src);
+
+ return addr_fold(skb->sk);
+}
+
+static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ __be32 dst = flow_get_u32_dst(flow);
+
+ if (dst)
+ return ntohl(dst);
+
+ return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true);
+}
+
+static u32 flow_get_proto(const struct sk_buff *skb,
+ const struct flow_keys *flow)
+{
+ return flow->basic.ip_proto;
+}
+
+static u32 flow_get_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
+{
+ if (flow->ports.ports)
+ return ntohs(flow->ports.src);
+
+ return addr_fold(skb->sk);
+}
+
+static u32 flow_get_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
+{
+ if (flow->ports.ports)
+ return ntohs(flow->ports.dst);
+
+ return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true);
+}
+
+static u32 flow_get_iif(const struct sk_buff *skb)
+{
+ return skb->skb_iif;
+}
+
+static u32 flow_get_priority(const struct sk_buff *skb)
+{
+ return skb->priority;
+}
+
+static u32 flow_get_mark(const struct sk_buff *skb)
+{
+ return skb->mark;
+}
+
+static u32 flow_get_nfct(const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ return addr_fold(skb_nfct(skb));
+#else
+ return 0;
+#endif
+}
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#define CTTUPLE(skb, member) \
+({ \
+ enum ip_conntrack_info ctinfo; \
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
+ if (ct == NULL) \
+ goto fallback; \
+ ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \
+})
+#else
+#define CTTUPLE(skb, member) \
+({ \
+ goto fallback; \
+ 0; \
+})
+#endif
+
+static u32 flow_get_nfct_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
+{
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ return ntohl(CTTUPLE(skb, src.u3.ip));
+ case htons(ETH_P_IPV6):
+ return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
+ }
+fallback:
+ return flow_get_src(skb, flow);
+}
+
+static u32 flow_get_nfct_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
+{
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ return ntohl(CTTUPLE(skb, dst.u3.ip));
+ case htons(ETH_P_IPV6):
+ return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
+ }
+fallback:
+ return flow_get_dst(skb, flow);
+}
+
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
+{
+ return ntohs(CTTUPLE(skb, src.u.all));
+fallback:
+ return flow_get_proto_src(skb, flow);
+}
+
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
+{
+ return ntohs(CTTUPLE(skb, dst.u.all));
+fallback:
+ return flow_get_proto_dst(skb, flow);
+}
+
+static u32 flow_get_rtclassid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (skb_dst(skb))
+ return skb_dst(skb)->tclassid;
+#endif
+ return 0;
+}
+
+static u32 flow_get_skuid(const struct sk_buff *skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (sk && sk->sk_socket && sk->sk_socket->file) {
+ kuid_t skuid = sk->sk_socket->file->f_cred->fsuid;
+
+ return from_kuid(&init_user_ns, skuid);
+ }
+ return 0;
+}
+
+static u32 flow_get_skgid(const struct sk_buff *skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (sk && sk->sk_socket && sk->sk_socket->file) {
+ kgid_t skgid = sk->sk_socket->file->f_cred->fsgid;
+
+ return from_kgid(&init_user_ns, skgid);
+ }
+ return 0;
+}
+
+static u32 flow_get_vlan_tag(const struct sk_buff *skb)
+{
+ u16 uninitialized_var(tag);
+
+ if (vlan_get_tag(skb, &tag) < 0)
+ return 0;
+ return tag & VLAN_VID_MASK;
+}
+
+static u32 flow_get_rxhash(struct sk_buff *skb)
+{
+ return skb_get_hash(skb);
+}
+
+static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
+{
+ switch (key) {
+ case FLOW_KEY_SRC:
+ return flow_get_src(skb, flow);
+ case FLOW_KEY_DST:
+ return flow_get_dst(skb, flow);
+ case FLOW_KEY_PROTO:
+ return flow_get_proto(skb, flow);
+ case FLOW_KEY_PROTO_SRC:
+ return flow_get_proto_src(skb, flow);
+ case FLOW_KEY_PROTO_DST:
+ return flow_get_proto_dst(skb, flow);
+ case FLOW_KEY_IIF:
+ return flow_get_iif(skb);
+ case FLOW_KEY_PRIORITY:
+ return flow_get_priority(skb);
+ case FLOW_KEY_MARK:
+ return flow_get_mark(skb);
+ case FLOW_KEY_NFCT:
+ return flow_get_nfct(skb);
+ case FLOW_KEY_NFCT_SRC:
+ return flow_get_nfct_src(skb, flow);
+ case FLOW_KEY_NFCT_DST:
+ return flow_get_nfct_dst(skb, flow);
+ case FLOW_KEY_NFCT_PROTO_SRC:
+ return flow_get_nfct_proto_src(skb, flow);
+ case FLOW_KEY_NFCT_PROTO_DST:
+ return flow_get_nfct_proto_dst(skb, flow);
+ case FLOW_KEY_RTCLASSID:
+ return flow_get_rtclassid(skb);
+ case FLOW_KEY_SKUID:
+ return flow_get_skuid(skb);
+ case FLOW_KEY_SKGID:
+ return flow_get_skgid(skb);
+ case FLOW_KEY_VLAN_TAG:
+ return flow_get_vlan_tag(skb);
+ case FLOW_KEY_RXHASH:
+ return flow_get_rxhash(skb);
+ default:
+ WARN_ON(1);
+ return 0;
+ }
+}
+
+#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \
+ (1 << FLOW_KEY_DST) | \
+ (1 << FLOW_KEY_PROTO) | \
+ (1 << FLOW_KEY_PROTO_SRC) | \
+ (1 << FLOW_KEY_PROTO_DST) | \
+ (1 << FLOW_KEY_NFCT_SRC) | \
+ (1 << FLOW_KEY_NFCT_DST) | \
+ (1 << FLOW_KEY_NFCT_PROTO_SRC) | \
+ (1 << FLOW_KEY_NFCT_PROTO_DST))
+
+static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct flow_head *head = rcu_dereference_bh(tp->root);
+ struct flow_filter *f;
+ u32 keymask;
+ u32 classid;
+ unsigned int n, key;
+ int r;
+
+ list_for_each_entry_rcu(f, &head->filters, list) {
+ u32 keys[FLOW_KEY_MAX + 1];
+ struct flow_keys flow_keys;
+
+ if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+ continue;
+
+ keymask = f->keymask;
+ if (keymask & FLOW_KEYS_NEEDED)
+ skb_flow_dissect_flow_keys(skb, &flow_keys, 0);
+
+ for (n = 0; n < f->nkeys; n++) {
+ key = ffs(keymask) - 1;
+ keymask &= ~(1 << key);
+ keys[n] = flow_key_get(skb, key, &flow_keys);
+ }
+
+ if (f->mode == FLOW_MODE_HASH)
+ classid = jhash2(keys, f->nkeys, f->hashrnd);
+ else {
+ classid = keys[0];
+ classid = (classid & f->mask) ^ f->xor;
+ classid = (classid >> f->rshift) + f->addend;
+ }
+
+ if (f->divisor)
+ classid %= f->divisor;
+
+ res->class = 0;
+ res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
+
+ r = tcf_exts_exec(skb, &f->exts, res);
+ if (r < 0)
+ continue;
+ return r;
+ }
+ return -1;
+}
+
+static void flow_perturbation(struct timer_list *t)
+{
+ struct flow_filter *f = from_timer(f, t, perturb_timer);
+
+ get_random_bytes(&f->hashrnd, 4);
+ if (f->perturb_period)
+ mod_timer(&f->perturb_timer, jiffies + f->perturb_period);
+}
+
+static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
+ [TCA_FLOW_KEYS] = { .type = NLA_U32 },
+ [TCA_FLOW_MODE] = { .type = NLA_U32 },
+ [TCA_FLOW_BASECLASS] = { .type = NLA_U32 },
+ [TCA_FLOW_RSHIFT] = { .type = NLA_U32 },
+ [TCA_FLOW_ADDEND] = { .type = NLA_U32 },
+ [TCA_FLOW_MASK] = { .type = NLA_U32 },
+ [TCA_FLOW_XOR] = { .type = NLA_U32 },
+ [TCA_FLOW_DIVISOR] = { .type = NLA_U32 },
+ [TCA_FLOW_ACT] = { .type = NLA_NESTED },
+ [TCA_FLOW_POLICE] = { .type = NLA_NESTED },
+ [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED },
+ [TCA_FLOW_PERTURB] = { .type = NLA_U32 },
+};
+
+static void __flow_destroy_filter(struct flow_filter *f)
+{
+ del_timer_sync(&f->perturb_timer);
+ tcf_exts_destroy(&f->exts);
+ tcf_em_tree_destroy(&f->ematches);
+ tcf_exts_put_net(&f->exts);
+ kfree(f);
+}
+
+static void flow_destroy_filter_work(struct work_struct *work)
+{
+ struct flow_filter *f = container_of(to_rcu_work(work),
+ struct flow_filter,
+ rwork);
+ rtnl_lock();
+ __flow_destroy_filter(f);
+ rtnl_unlock();
+}
+
+static int flow_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ void **arg, bool ovr, struct netlink_ext_ack *extack)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *fold, *fnew;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_FLOW_MAX + 1];
+ unsigned int nkeys = 0;
+ unsigned int perturb_period = 0;
+ u32 baseclass = 0;
+ u32 keymask = 0;
+ u32 mode;
+ int err;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_FLOW_BASECLASS]) {
+ baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
+ if (TC_H_MIN(baseclass) == 0)
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOW_KEYS]) {
+ keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
+
+ nkeys = hweight32(keymask);
+ if (nkeys == 0)
+ return -EINVAL;
+
+ if (fls(keymask) - 1 > FLOW_KEY_MAX)
+ return -EOPNOTSUPP;
+
+ if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
+ sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
+ return -EOPNOTSUPP;
+ }
+
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (!fnew)
+ return -ENOBUFS;
+
+ err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &fnew->ematches);
+ if (err < 0)
+ goto err1;
+
+ err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ if (err < 0)
+ goto err2;
+
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr,
+ extack);
+ if (err < 0)
+ goto err2;
+
+ fold = *arg;
+ if (fold) {
+ err = -EINVAL;
+ if (fold->handle != handle && handle)
+ goto err2;
+
+ /* Copy fold into fnew */
+ fnew->tp = fold->tp;
+ fnew->handle = fold->handle;
+ fnew->nkeys = fold->nkeys;
+ fnew->keymask = fold->keymask;
+ fnew->mode = fold->mode;
+ fnew->mask = fold->mask;
+ fnew->xor = fold->xor;
+ fnew->rshift = fold->rshift;
+ fnew->addend = fold->addend;
+ fnew->divisor = fold->divisor;
+ fnew->baseclass = fold->baseclass;
+ fnew->hashrnd = fold->hashrnd;
+
+ mode = fold->mode;
+ if (tb[TCA_FLOW_MODE])
+ mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+ if (mode != FLOW_MODE_HASH && nkeys > 1)
+ goto err2;
+
+ if (mode == FLOW_MODE_HASH)
+ perturb_period = fold->perturb_period;
+ if (tb[TCA_FLOW_PERTURB]) {
+ if (mode != FLOW_MODE_HASH)
+ goto err2;
+ perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
+ }
+ } else {
+ err = -EINVAL;
+ if (!handle)
+ goto err2;
+ if (!tb[TCA_FLOW_KEYS])
+ goto err2;
+
+ mode = FLOW_MODE_MAP;
+ if (tb[TCA_FLOW_MODE])
+ mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+ if (mode != FLOW_MODE_HASH && nkeys > 1)
+ goto err2;
+
+ if (tb[TCA_FLOW_PERTURB]) {
+ if (mode != FLOW_MODE_HASH)
+ goto err2;
+ perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
+ }
+
+ if (TC_H_MAJ(baseclass) == 0) {
+ struct Qdisc *q = tcf_block_q(tp->chain->block);
+
+ baseclass = TC_H_MAKE(q->handle, baseclass);
+ }
+ if (TC_H_MIN(baseclass) == 0)
+ baseclass = TC_H_MAKE(baseclass, 1);
+
+ fnew->handle = handle;
+ fnew->mask = ~0U;
+ fnew->tp = tp;
+ get_random_bytes(&fnew->hashrnd, 4);
+ }
+
+ timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE);
+
+ tcf_block_netif_keep_dst(tp->chain->block);
+
+ if (tb[TCA_FLOW_KEYS]) {
+ fnew->keymask = keymask;
+ fnew->nkeys = nkeys;
+ }
+
+ fnew->mode = mode;
+
+ if (tb[TCA_FLOW_MASK])
+ fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
+ if (tb[TCA_FLOW_XOR])
+ fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
+ if (tb[TCA_FLOW_RSHIFT])
+ fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
+ if (tb[TCA_FLOW_ADDEND])
+ fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
+
+ if (tb[TCA_FLOW_DIVISOR])
+ fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
+ if (baseclass)
+ fnew->baseclass = baseclass;
+
+ fnew->perturb_period = perturb_period;
+ if (perturb_period)
+ mod_timer(&fnew->perturb_timer, jiffies + perturb_period);
+
+ if (!*arg)
+ list_add_tail_rcu(&fnew->list, &head->filters);
+ else
+ list_replace_rcu(&fold->list, &fnew->list);
+
+ *arg = fnew;
+
+ if (fold) {
+ tcf_exts_get_net(&fold->exts);
+ tcf_queue_work(&fold->rwork, flow_destroy_filter_work);
+ }
+ return 0;
+
+err2:
+ tcf_exts_destroy(&fnew->exts);
+ tcf_em_tree_destroy(&fnew->ematches);
+err1:
+ kfree(fnew);
+ return err;
+}
+
+static int flow_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *f = arg;
+
+ list_del_rcu(&f->list);
+ tcf_exts_get_net(&f->exts);
+ tcf_queue_work(&f->rwork, flow_destroy_filter_work);
+ *last = list_empty(&head->filters);
+ return 0;
+}
+
+static int flow_init(struct tcf_proto *tp)
+{
+ struct flow_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+ INIT_LIST_HEAD(&head->filters);
+ rcu_assign_pointer(tp->root, head);
+ return 0;
+}
+
+static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *f, *next;
+
+ list_for_each_entry_safe(f, next, &head->filters, list) {
+ list_del_rcu(&f->list);
+ if (tcf_exts_get_net(&f->exts))
+ tcf_queue_work(&f->rwork, flow_destroy_filter_work);
+ else
+ __flow_destroy_filter(f);
+ }
+ kfree_rcu(head, rcu);
+}
+
+static void *flow_get(struct tcf_proto *tp, u32 handle)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *f;
+
+ list_for_each_entry(f, &head->filters, list)
+ if (f->handle == handle)
+ return f;
+ return NULL;
+}
+
+static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct flow_filter *f = fh;
+ struct nlattr *nest;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) ||
+ nla_put_u32(skb, TCA_FLOW_MODE, f->mode))
+ goto nla_put_failure;
+
+ if (f->mask != ~0 || f->xor != 0) {
+ if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) ||
+ nla_put_u32(skb, TCA_FLOW_XOR, f->xor))
+ goto nla_put_failure;
+ }
+ if (f->rshift &&
+ nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift))
+ goto nla_put_failure;
+ if (f->addend &&
+ nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend))
+ goto nla_put_failure;
+
+ if (f->divisor &&
+ nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor))
+ goto nla_put_failure;
+ if (f->baseclass &&
+ nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass))
+ goto nla_put_failure;
+
+ if (f->perturb_period &&
+ nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+#ifdef CONFIG_NET_EMATCH
+ if (f->ematches.hdr.nmatches &&
+ tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
+ goto nla_put_failure;
+#endif
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *f;
+
+ list_for_each_entry(f, &head->filters, list) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static struct tcf_proto_ops cls_flow_ops __read_mostly = {
+ .kind = "flow",
+ .classify = flow_classify,
+ .init = flow_init,
+ .destroy = flow_destroy,
+ .change = flow_change,
+ .delete = flow_delete,
+ .get = flow_get,
+ .dump = flow_dump,
+ .walk = flow_walk,
+ .owner = THIS_MODULE,
+};
+
+static int __init cls_flow_init(void)
+{
+ return register_tcf_proto_ops(&cls_flow_ops);
+}
+
+static void __exit cls_flow_exit(void)
+{
+ unregister_tcf_proto_ops(&cls_flow_ops);
+}
+
+module_init(cls_flow_init);
+module_exit(cls_flow_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("TC flow classifier");
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
new file mode 100644
index 000000000..616364814
--- /dev/null
+++ b/net/sched/cls_flower.c
@@ -0,0 +1,2001 @@
+/*
+ * net/sched/cls_flower.c Flower classifier
+ *
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/rhashtable.h>
+#include <linux/workqueue.h>
+
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/mpls.h>
+
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+#include <net/ip.h>
+#include <net/flow_dissector.h>
+#include <net/geneve.h>
+
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
+struct fl_flow_key {
+ int indev_ifindex;
+ struct flow_dissector_key_control control;
+ struct flow_dissector_key_control enc_control;
+ struct flow_dissector_key_basic basic;
+ struct flow_dissector_key_eth_addrs eth;
+ struct flow_dissector_key_vlan vlan;
+ struct flow_dissector_key_vlan cvlan;
+ union {
+ struct flow_dissector_key_ipv4_addrs ipv4;
+ struct flow_dissector_key_ipv6_addrs ipv6;
+ };
+ struct flow_dissector_key_ports tp;
+ struct flow_dissector_key_icmp icmp;
+ struct flow_dissector_key_arp arp;
+ struct flow_dissector_key_keyid enc_key_id;
+ union {
+ struct flow_dissector_key_ipv4_addrs enc_ipv4;
+ struct flow_dissector_key_ipv6_addrs enc_ipv6;
+ };
+ struct flow_dissector_key_ports enc_tp;
+ struct flow_dissector_key_mpls mpls;
+ struct flow_dissector_key_tcp tcp;
+ struct flow_dissector_key_ip ip;
+ struct flow_dissector_key_ip enc_ip;
+ struct flow_dissector_key_enc_opts enc_opts;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+struct fl_flow_mask_range {
+ unsigned short int start;
+ unsigned short int end;
+};
+
+struct fl_flow_mask {
+ struct fl_flow_key key;
+ struct fl_flow_mask_range range;
+ struct rhash_head ht_node;
+ struct rhashtable ht;
+ struct rhashtable_params filter_ht_params;
+ struct flow_dissector dissector;
+ struct list_head filters;
+ struct rcu_work rwork;
+ struct list_head list;
+};
+
+struct fl_flow_tmplt {
+ struct fl_flow_key dummy_key;
+ struct fl_flow_key mask;
+ struct flow_dissector dissector;
+ struct tcf_chain *chain;
+};
+
+struct cls_fl_head {
+ struct rhashtable ht;
+ struct list_head masks;
+ struct rcu_work rwork;
+ struct idr handle_idr;
+};
+
+struct cls_fl_filter {
+ struct fl_flow_mask *mask;
+ struct rhash_head ht_node;
+ struct fl_flow_key mkey;
+ struct tcf_exts exts;
+ struct tcf_result res;
+ struct fl_flow_key key;
+ struct list_head list;
+ u32 handle;
+ u32 flags;
+ unsigned int in_hw_count;
+ struct rcu_work rwork;
+ struct net_device *hw_dev;
+};
+
+static const struct rhashtable_params mask_ht_params = {
+ .key_offset = offsetof(struct fl_flow_mask, key),
+ .key_len = sizeof(struct fl_flow_key),
+ .head_offset = offsetof(struct fl_flow_mask, ht_node),
+ .automatic_shrinking = true,
+};
+
+static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
+{
+ return mask->range.end - mask->range.start;
+}
+
+static void fl_mask_update_range(struct fl_flow_mask *mask)
+{
+ const u8 *bytes = (const u8 *) &mask->key;
+ size_t size = sizeof(mask->key);
+ size_t i, first = 0, last;
+
+ for (i = 0; i < size; i++) {
+ if (bytes[i]) {
+ first = i;
+ break;
+ }
+ }
+ last = first;
+ for (i = size - 1; i != first; i--) {
+ if (bytes[i]) {
+ last = i;
+ break;
+ }
+ }
+ mask->range.start = rounddown(first, sizeof(long));
+ mask->range.end = roundup(last + 1, sizeof(long));
+}
+
+static void *fl_key_get_start(struct fl_flow_key *key,
+ const struct fl_flow_mask *mask)
+{
+ return (u8 *) key + mask->range.start;
+}
+
+static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key,
+ struct fl_flow_mask *mask)
+{
+ const long *lkey = fl_key_get_start(key, mask);
+ const long *lmask = fl_key_get_start(&mask->key, mask);
+ long *lmkey = fl_key_get_start(mkey, mask);
+ int i;
+
+ for (i = 0; i < fl_mask_range(mask); i += sizeof(long))
+ *lmkey++ = *lkey++ & *lmask++;
+}
+
+static bool fl_mask_fits_tmplt(struct fl_flow_tmplt *tmplt,
+ struct fl_flow_mask *mask)
+{
+ const long *lmask = fl_key_get_start(&mask->key, mask);
+ const long *ltmplt;
+ int i;
+
+ if (!tmplt)
+ return true;
+ ltmplt = fl_key_get_start(&tmplt->mask, mask);
+ for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) {
+ if (~*ltmplt++ & *lmask++)
+ return false;
+ }
+ return true;
+}
+
+static void fl_clear_masked_range(struct fl_flow_key *key,
+ struct fl_flow_mask *mask)
+{
+ memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
+}
+
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
+ struct fl_flow_key *mkey)
+{
+ return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
+ mask->filter_ht_params);
+}
+
+static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_fl_head *head = rcu_dereference_bh(tp->root);
+ struct cls_fl_filter *f;
+ struct fl_flow_mask *mask;
+ struct fl_flow_key skb_key;
+ struct fl_flow_key skb_mkey;
+
+ list_for_each_entry_rcu(mask, &head->masks, list) {
+ flow_dissector_init_keys(&skb_key.control, &skb_key.basic);
+ fl_clear_masked_range(&skb_key, mask);
+
+ skb_key.indev_ifindex = skb->skb_iif;
+ /* skb_flow_dissect() does not set n_proto in case an unknown
+ * protocol, so do it rather here.
+ */
+ skb_key.basic.n_proto = skb_protocol(skb, false);
+ skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+ skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
+
+ fl_set_masked_key(&skb_mkey, &skb_key, mask);
+
+ f = fl_lookup(mask, &skb_mkey);
+ if (f && !tc_skip_sw(f->flags)) {
+ *res = f->res;
+ return tcf_exts_exec(skb, &f->exts, res);
+ }
+ }
+ return -1;
+}
+
+static int fl_init(struct tcf_proto *tp)
+{
+ struct cls_fl_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ return -ENOBUFS;
+
+ INIT_LIST_HEAD_RCU(&head->masks);
+ rcu_assign_pointer(tp->root, head);
+ idr_init(&head->handle_idr);
+
+ return rhashtable_init(&head->ht, &mask_ht_params);
+}
+
+static void fl_mask_free(struct fl_flow_mask *mask)
+{
+ rhashtable_destroy(&mask->ht);
+ kfree(mask);
+}
+
+static void fl_mask_free_work(struct work_struct *work)
+{
+ struct fl_flow_mask *mask = container_of(to_rcu_work(work),
+ struct fl_flow_mask, rwork);
+
+ fl_mask_free(mask);
+}
+
+static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
+ bool async)
+{
+ if (!list_empty(&mask->filters))
+ return false;
+
+ rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
+ list_del_rcu(&mask->list);
+ if (async)
+ tcf_queue_work(&mask->rwork, fl_mask_free_work);
+ else
+ fl_mask_free(mask);
+
+ return true;
+}
+
+static void __fl_destroy_filter(struct cls_fl_filter *f)
+{
+ tcf_exts_destroy(&f->exts);
+ tcf_exts_put_net(&f->exts);
+ kfree(f);
+}
+
+static void fl_destroy_filter_work(struct work_struct *work)
+{
+ struct cls_fl_filter *f = container_of(to_rcu_work(work),
+ struct cls_fl_filter, rwork);
+
+ rtnl_lock();
+ __fl_destroy_filter(f);
+ rtnl_unlock();
+}
+
+static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = tp->chain->block;
+
+ tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
+ cls_flower.command = TC_CLSFLOWER_DESTROY;
+ cls_flower.cookie = (unsigned long) f;
+
+ tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+ tcf_block_offload_dec(block, &f->flags);
+}
+
+static int fl_hw_replace_filter(struct tcf_proto *tp,
+ struct cls_fl_filter *f,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = tp->chain->block;
+ bool skip_sw = tc_skip_sw(f->flags);
+ int err;
+
+ tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
+ cls_flower.command = TC_CLSFLOWER_REPLACE;
+ cls_flower.cookie = (unsigned long) f;
+ cls_flower.dissector = &f->mask->dissector;
+ cls_flower.mask = &f->mask->key;
+ cls_flower.key = &f->mkey;
+ cls_flower.exts = &f->exts;
+ cls_flower.classid = f->res.classid;
+
+ err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
+ &cls_flower, skip_sw);
+ if (err < 0) {
+ fl_hw_destroy_filter(tp, f, NULL);
+ return err;
+ } else if (err > 0) {
+ f->in_hw_count = err;
+ tcf_block_offload_inc(block, &f->flags);
+ }
+
+ if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = tp->chain->block;
+
+ tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
+ cls_flower.command = TC_CLSFLOWER_STATS;
+ cls_flower.cookie = (unsigned long) f;
+ cls_flower.exts = &f->exts;
+ cls_flower.classid = f->res.classid;
+
+ tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+}
+
+static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ bool async = tcf_exts_get_net(&f->exts);
+ bool last;
+
+ idr_remove(&head->handle_idr, f->handle);
+ list_del_rcu(&f->list);
+ last = fl_mask_put(head, f->mask, async);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_destroy_filter(tp, f, extack);
+ tcf_unbind_filter(tp, &f->res);
+ if (async)
+ tcf_queue_work(&f->rwork, fl_destroy_filter_work);
+ else
+ __fl_destroy_filter(f);
+
+ return last;
+}
+
+static void fl_destroy_sleepable(struct work_struct *work)
+{
+ struct cls_fl_head *head = container_of(to_rcu_work(work),
+ struct cls_fl_head,
+ rwork);
+
+ rhashtable_destroy(&head->ht);
+ kfree(head);
+ module_put(THIS_MODULE);
+}
+
+static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct fl_flow_mask *mask, *next_mask;
+ struct cls_fl_filter *f, *next;
+
+ list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
+ list_for_each_entry_safe(f, next, &mask->filters, list) {
+ if (__fl_delete(tp, f, extack))
+ break;
+ }
+ }
+ idr_destroy(&head->handle_idr);
+
+ __module_get(THIS_MODULE);
+ tcf_queue_work(&head->rwork, fl_destroy_sleepable);
+}
+
+static void *fl_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+
+ return idr_find(&head->handle_idr, handle);
+}
+
+static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
+ [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC },
+ [TCA_FLOWER_CLASSID] = { .type = NLA_U32 },
+ [TCA_FLOWER_INDEV] = { .type = NLA_STRING,
+ .len = IFNAMSIZ },
+ [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_SRC] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_DST] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_TIP] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_MPLS_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
+ [TCA_FLOWER_FLAGS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
+ .len = 128 },
+};
+
+static void fl_set_key_val(struct nlattr **tb,
+ void *val, int val_type,
+ void *mask, int mask_type, int len)
+{
+ if (!tb[val_type])
+ return;
+ memcpy(val, nla_data(tb[val_type]), len);
+ if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type])
+ memset(mask, 0xff, len);
+ else
+ memcpy(mask, nla_data(tb[mask_type]), len);
+}
+
+static int fl_set_key_mpls(struct nlattr **tb,
+ struct flow_dissector_key_mpls *key_val,
+ struct flow_dissector_key_mpls *key_mask)
+{
+ if (tb[TCA_FLOWER_KEY_MPLS_TTL]) {
+ key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]);
+ key_mask->mpls_ttl = MPLS_TTL_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_BOS]) {
+ u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]);
+
+ if (bos & ~MPLS_BOS_MASK)
+ return -EINVAL;
+ key_val->mpls_bos = bos;
+ key_mask->mpls_bos = MPLS_BOS_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_TC]) {
+ u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]);
+
+ if (tc & ~MPLS_TC_MASK)
+ return -EINVAL;
+ key_val->mpls_tc = tc;
+ key_mask->mpls_tc = MPLS_TC_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) {
+ u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]);
+
+ if (label & ~MPLS_LABEL_MASK)
+ return -EINVAL;
+ key_val->mpls_label = label;
+ key_mask->mpls_label = MPLS_LABEL_MASK;
+ }
+ return 0;
+}
+
+static void fl_set_key_vlan(struct nlattr **tb,
+ __be16 ethertype,
+ int vlan_id_key, int vlan_prio_key,
+ int vlan_next_eth_type_key,
+ struct flow_dissector_key_vlan *key_val,
+ struct flow_dissector_key_vlan *key_mask)
+{
+#define VLAN_PRIORITY_MASK 0x7
+
+ if (tb[vlan_id_key]) {
+ key_val->vlan_id =
+ nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK;
+ key_mask->vlan_id = VLAN_VID_MASK;
+ }
+ if (tb[vlan_prio_key]) {
+ key_val->vlan_priority =
+ nla_get_u8(tb[vlan_prio_key]) &
+ VLAN_PRIORITY_MASK;
+ key_mask->vlan_priority = VLAN_PRIORITY_MASK;
+ }
+ key_val->vlan_tpid = ethertype;
+ key_mask->vlan_tpid = cpu_to_be16(~0);
+ if (tb[vlan_next_eth_type_key]) {
+ key_val->vlan_eth_type =
+ nla_get_be16(tb[vlan_next_eth_type_key]);
+ key_mask->vlan_eth_type = cpu_to_be16(~0);
+ }
+}
+
+static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
+ u32 *dissector_key, u32 *dissector_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (flower_mask & flower_flag_bit) {
+ *dissector_mask |= dissector_flag_bit;
+ if (flower_key & flower_flag_bit)
+ *dissector_key |= dissector_flag_bit;
+ }
+}
+
+static int fl_set_key_flags(struct nlattr **tb,
+ u32 *flags_key, u32 *flags_mask)
+{
+ u32 key, mask;
+
+ /* mask is mandatory for flags */
+ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ return -EINVAL;
+
+ key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
+ mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+
+ *flags_key = 0;
+ *flags_mask = 0;
+
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
+ FLOW_DIS_FIRST_FRAG);
+
+ return 0;
+}
+
+static void fl_set_key_ip(struct nlattr **tb, bool encap,
+ struct flow_dissector_key_ip *key,
+ struct flow_dissector_key_ip *mask)
+{
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
+
+ fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos));
+ fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl));
+}
+
+static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1];
+ struct nlattr *class = NULL, *type = NULL, *data = NULL;
+ struct geneve_opt *opt;
+ int err, data_len = 0;
+
+ if (option_len > sizeof(struct geneve_opt))
+ data_len = option_len - sizeof(struct geneve_opt);
+
+ opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len];
+ memset(opt, 0xff, option_len);
+ opt->length = data_len / 4;
+ opt->r1 = 0;
+ opt->r2 = 0;
+ opt->r3 = 0;
+
+ /* If no mask has been prodived we assume an exact match. */
+ if (!depth)
+ return sizeof(struct geneve_opt) + data_len;
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GENEVE) {
+ NL_SET_ERR_MSG(extack, "Non-geneve option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ /* We are not allowed to omit any of CLASS, TYPE or DATA
+ * fields from the key.
+ */
+ if (!option_len &&
+ (!tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA])) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
+ return -EINVAL;
+ }
+
+ /* Omitting any of CLASS, TYPE or DATA fields is allowed
+ * for the mask.
+ */
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]) {
+ int new_len = key->enc_opts.len;
+
+ data = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA];
+ data_len = nla_len(data);
+ if (data_len < 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
+ return -ERANGE;
+ }
+ if (data_len % 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
+ return -ERANGE;
+ }
+
+ new_len += sizeof(struct geneve_opt) + data_len;
+ BUILD_BUG_ON(FLOW_DIS_TUN_OPTS_MAX != IP_TUNNEL_OPTS_MAX);
+ if (new_len > FLOW_DIS_TUN_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -ERANGE;
+ }
+ opt->length = data_len / 4;
+ memcpy(opt->opt_data, nla_data(data), data_len);
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]) {
+ class = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS];
+ opt->opt_class = nla_get_be16(class);
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]) {
+ type = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE];
+ opt->type = nla_get_u8(type);
+ }
+
+ return sizeof(struct geneve_opt) + data_len;
+}
+
+static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL;
+ int err, option_len, key_depth, msk_depth = 0;
+
+ err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS],
+ TCA_FLOWER_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]);
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
+ err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK],
+ TCA_FLOWER_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+ msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+ }
+
+ nla_for_each_attr(nla_opt_key, nla_enc_key,
+ nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
+ switch (nla_type(nla_opt_key)) {
+ case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+ option_len = fl_set_geneve_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+ option_len = fl_set_geneve_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int fl_set_key(struct net *net, struct nlattr **tb,
+ struct fl_flow_key *key, struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ __be16 ethertype;
+ int ret = 0;
+#ifdef CONFIG_NET_CLS_IND
+ if (tb[TCA_FLOWER_INDEV]) {
+ int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
+ if (err < 0)
+ return err;
+ key->indev_ifindex = err;
+ mask->indev_ifindex = 0xffffffff;
+ }
+#endif
+
+ fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
+ mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
+ sizeof(key->eth.dst));
+ fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC,
+ mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
+ sizeof(key->eth.src));
+
+ if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
+ ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
+
+ if (eth_type_vlan(ethertype)) {
+ fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
+ TCA_FLOWER_KEY_VLAN_PRIO,
+ TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ &key->vlan, &mask->vlan);
+
+ if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) {
+ ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]);
+ if (eth_type_vlan(ethertype)) {
+ fl_set_key_vlan(tb, ethertype,
+ TCA_FLOWER_KEY_CVLAN_ID,
+ TCA_FLOWER_KEY_CVLAN_PRIO,
+ TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ &key->cvlan, &mask->cvlan);
+ fl_set_key_val(tb, &key->basic.n_proto,
+ TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ &mask->basic.n_proto,
+ TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto));
+ } else {
+ key->basic.n_proto = ethertype;
+ mask->basic.n_proto = cpu_to_be16(~0);
+ }
+ }
+ } else {
+ key->basic.n_proto = ethertype;
+ mask->basic.n_proto = cpu_to_be16(~0);
+ }
+ }
+
+ if (key->basic.n_proto == htons(ETH_P_IP) ||
+ key->basic.n_proto == htons(ETH_P_IPV6)) {
+ fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
+ &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.ip_proto));
+ fl_set_key_ip(tb, false, &key->ip, &mask->ip);
+ }
+
+ if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ mask->control.addr_type = ~0;
+ fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
+ &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+ sizeof(key->ipv4.src));
+ fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST,
+ &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK,
+ sizeof(key->ipv4.dst));
+ } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) {
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ mask->control.addr_type = ~0;
+ fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
+ &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+ sizeof(key->ipv6.src));
+ fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST,
+ &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK,
+ sizeof(key->ipv6.dst));
+ }
+
+ if (key->basic.ip_proto == IPPROTO_TCP) {
+ fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
+ sizeof(key->tp.src));
+ fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
+ sizeof(key->tp.dst));
+ fl_set_key_val(tb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS,
+ &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK,
+ sizeof(key->tcp.flags));
+ } else if (key->basic.ip_proto == IPPROTO_UDP) {
+ fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
+ sizeof(key->tp.src));
+ fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
+ sizeof(key->tp.dst));
+ } else if (key->basic.ip_proto == IPPROTO_SCTP) {
+ fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
+ sizeof(key->tp.src));
+ fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
+ sizeof(key->tp.dst));
+ } else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code));
+ } else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
+ sizeof(key->icmp.code));
+ } else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) ||
+ key->basic.n_proto == htons(ETH_P_MPLS_MC)) {
+ ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls);
+ if (ret)
+ return ret;
+ } else if (key->basic.n_proto == htons(ETH_P_ARP) ||
+ key->basic.n_proto == htons(ETH_P_RARP)) {
+ fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP,
+ &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK,
+ sizeof(key->arp.sip));
+ fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP,
+ &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK,
+ sizeof(key->arp.tip));
+ fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP,
+ &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK,
+ sizeof(key->arp.op));
+ fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+ mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+ sizeof(key->arp.sha));
+ fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+ mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+ sizeof(key->arp.tha));
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
+ tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ mask->enc_control.addr_type = ~0;
+ fl_set_key_val(tb, &key->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC,
+ &mask->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+ sizeof(key->enc_ipv4.src));
+ fl_set_key_val(tb, &key->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST,
+ &mask->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+ sizeof(key->enc_ipv4.dst));
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] ||
+ tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) {
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ mask->enc_control.addr_type = ~0;
+ fl_set_key_val(tb, &key->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC,
+ &mask->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+ sizeof(key->enc_ipv6.src));
+ fl_set_key_val(tb, &key->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST,
+ &mask->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+ sizeof(key->enc_ipv6.dst));
+ }
+
+ fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC,
+ sizeof(key->enc_key_id.keyid));
+
+ fl_set_key_val(tb, &key->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
+ &mask->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
+ sizeof(key->enc_tp.src));
+
+ fl_set_key_val(tb, &key->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
+ &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
+ sizeof(key->enc_tp.dst));
+
+ fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPTS]) {
+ ret = fl_set_enc_opt(tb, key, mask, extack);
+ if (ret)
+ return ret;
+ }
+
+ if (tb[TCA_FLOWER_KEY_FLAGS])
+ ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+
+ return ret;
+}
+
+static void fl_mask_copy(struct fl_flow_mask *dst,
+ struct fl_flow_mask *src)
+{
+ const void *psrc = fl_key_get_start(&src->key, src);
+ void *pdst = fl_key_get_start(&dst->key, src);
+
+ memcpy(pdst, psrc, fl_mask_range(src));
+ dst->range = src->range;
+}
+
+static const struct rhashtable_params fl_ht_params = {
+ .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */
+ .head_offset = offsetof(struct cls_fl_filter, ht_node),
+ .automatic_shrinking = true,
+};
+
+static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
+{
+ mask->filter_ht_params = fl_ht_params;
+ mask->filter_ht_params.key_len = fl_mask_range(mask);
+ mask->filter_ht_params.key_offset += mask->range.start;
+
+ return rhashtable_init(&mask->ht, &mask->filter_ht_params);
+}
+
+#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
+#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member))
+
+#define FL_KEY_IS_MASKED(mask, member) \
+ memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
+ 0, FL_KEY_MEMBER_SIZE(member)) \
+
+#define FL_KEY_SET(keys, cnt, id, member) \
+ do { \
+ keys[cnt].key_id = id; \
+ keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \
+ cnt++; \
+ } while(0);
+
+#define FL_KEY_SET_IF_MASKED(mask, keys, cnt, id, member) \
+ do { \
+ if (FL_KEY_IS_MASKED(mask, member)) \
+ FL_KEY_SET(keys, cnt, id, member); \
+ } while(0);
+
+static void fl_init_dissector(struct flow_dissector *dissector,
+ struct fl_flow_key *mask)
+{
+ struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
+ size_t cnt = 0;
+
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_IP, ip);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_TCP, tcp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ICMP, icmp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ARP, arp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_MPLS, mpls);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_VLAN, vlan);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_CVLAN, cvlan);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
+ if (FL_KEY_IS_MASKED(mask, enc_ipv4) ||
+ FL_KEY_IS_MASKED(mask, enc_ipv6))
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
+ enc_control);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
+
+ skb_flow_dissector_init(dissector, keys, cnt);
+}
+
+static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
+ struct fl_flow_mask *mask)
+{
+ struct fl_flow_mask *newmask;
+ int err;
+
+ newmask = kzalloc(sizeof(*newmask), GFP_KERNEL);
+ if (!newmask)
+ return ERR_PTR(-ENOMEM);
+
+ fl_mask_copy(newmask, mask);
+
+ err = fl_init_mask_hashtable(newmask);
+ if (err)
+ goto errout_free;
+
+ fl_init_dissector(&newmask->dissector, &newmask->key);
+
+ INIT_LIST_HEAD_RCU(&newmask->filters);
+
+ err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
+ mask_ht_params);
+ if (err)
+ goto errout_destroy;
+
+ list_add_tail_rcu(&newmask->list, &head->masks);
+
+ return newmask;
+
+errout_destroy:
+ rhashtable_destroy(&newmask->ht);
+errout_free:
+ kfree(newmask);
+
+ return ERR_PTR(err);
+}
+
+static int fl_check_assign_mask(struct cls_fl_head *head,
+ struct cls_fl_filter *fnew,
+ struct cls_fl_filter *fold,
+ struct fl_flow_mask *mask)
+{
+ struct fl_flow_mask *newmask;
+
+ fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
+ if (!fnew->mask) {
+ if (fold)
+ return -EINVAL;
+
+ newmask = fl_create_new_mask(head, mask);
+ if (IS_ERR(newmask))
+ return PTR_ERR(newmask);
+
+ fnew->mask = newmask;
+ } else if (fold && fold->mask != fnew->mask) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fl_set_parms(struct net *net, struct tcf_proto *tp,
+ struct cls_fl_filter *f, struct fl_flow_mask *mask,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est, bool ovr,
+ struct fl_flow_tmplt *tmplt,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_FLOWER_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ err = fl_set_key(net, tb, &f->key, &mask->key, extack);
+ if (err)
+ return err;
+
+ fl_mask_update_range(mask);
+ fl_set_masked_key(&f->mkey, &f->key, mask);
+
+ if (!fl_mask_fits_tmplt(tmplt, mask)) {
+ NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fl_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ void **arg, bool ovr, struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *fold = *arg;
+ struct cls_fl_filter *fnew;
+ struct fl_flow_mask *mask;
+ struct nlattr **tb;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return -EINVAL;
+
+ mask = kzalloc(sizeof(struct fl_flow_mask), GFP_KERNEL);
+ if (!mask)
+ return -ENOBUFS;
+
+ tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
+ if (!tb) {
+ err = -ENOBUFS;
+ goto errout_mask_alloc;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
+ fl_policy, NULL);
+ if (err < 0)
+ goto errout_tb;
+
+ if (fold && handle && fold->handle != handle) {
+ err = -EINVAL;
+ goto errout_tb;
+ }
+
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (!fnew) {
+ err = -ENOBUFS;
+ goto errout_tb;
+ }
+
+ err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
+ if (err < 0)
+ goto errout;
+
+ if (tb[TCA_FLOWER_FLAGS]) {
+ fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
+ if (!tc_flags_valid(fnew->flags)) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
+ err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr,
+ tp->chain->tmplt_priv, extack);
+ if (err)
+ goto errout;
+
+ err = fl_check_assign_mask(head, fnew, fold, mask);
+ if (err)
+ goto errout;
+
+ if (!handle) {
+ handle = 1;
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ INT_MAX, GFP_KERNEL);
+ } else if (!fold) {
+ /* user specifies a handle and it doesn't exist */
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ handle, GFP_KERNEL);
+ }
+ if (err)
+ goto errout_mask;
+ fnew->handle = handle;
+
+ if (!tc_skip_sw(fnew->flags)) {
+ if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
+ err = -EEXIST;
+ goto errout_idr;
+ }
+
+ err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
+ if (err)
+ goto errout_idr;
+ }
+
+ if (!tc_skip_hw(fnew->flags)) {
+ err = fl_hw_replace_filter(tp, fnew, extack);
+ if (err)
+ goto errout_mask;
+ }
+
+ if (!tc_in_hw(fnew->flags))
+ fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
+ if (fold) {
+ if (!tc_skip_sw(fold->flags))
+ rhashtable_remove_fast(&fold->mask->ht,
+ &fold->ht_node,
+ fold->mask->filter_ht_params);
+ if (!tc_skip_hw(fold->flags))
+ fl_hw_destroy_filter(tp, fold, NULL);
+ }
+
+ *arg = fnew;
+
+ if (fold) {
+ idr_replace(&head->handle_idr, fnew, fnew->handle);
+ list_replace_rcu(&fold->list, &fnew->list);
+ tcf_unbind_filter(tp, &fold->res);
+ tcf_exts_get_net(&fold->exts);
+ tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
+ } else {
+ list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
+ }
+
+ kfree(tb);
+ kfree(mask);
+ return 0;
+
+errout_idr:
+ if (!fold)
+ idr_remove(&head->handle_idr, fnew->handle);
+
+errout_mask:
+ fl_mask_put(head, fnew->mask, false);
+
+errout:
+ tcf_exts_destroy(&fnew->exts);
+ kfree(fnew);
+errout_tb:
+ kfree(tb);
+errout_mask_alloc:
+ kfree(mask);
+ return err;
+}
+
+static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *f = arg;
+
+ if (!tc_skip_sw(f->flags))
+ rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+ f->mask->filter_ht_params);
+ __fl_delete(tp, f, extack);
+ *last = list_empty(&head->masks);
+ return 0;
+}
+
+static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_filter *f;
+
+ arg->count = arg->skip;
+
+ while ((f = idr_get_next_ul(&head->handle_idr,
+ &arg->cookie)) != NULL) {
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->cookie = f->handle + 1;
+ arg->count++;
+ }
+}
+
+static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = tp->chain->block;
+ struct fl_flow_mask *mask;
+ struct cls_fl_filter *f;
+ int err;
+
+ list_for_each_entry(mask, &head->masks, list) {
+ list_for_each_entry(f, &mask->filters, list) {
+ if (tc_skip_hw(f->flags))
+ continue;
+
+ tc_cls_common_offload_init(&cls_flower.common, tp,
+ f->flags, extack);
+ cls_flower.command = add ?
+ TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
+ cls_flower.cookie = (unsigned long)f;
+ cls_flower.dissector = &mask->dissector;
+ cls_flower.mask = &mask->key;
+ cls_flower.key = &f->mkey;
+ cls_flower.exts = &f->exts;
+ cls_flower.classid = f->res.classid;
+
+ err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(f->flags))
+ return err;
+ continue;
+ }
+
+ tc_cls_offload_cnt_update(block, &f->in_hw_count,
+ &f->flags, add);
+ }
+ }
+
+ return 0;
+}
+
+static void fl_hw_create_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = chain->block;
+ struct tcf_exts dummy_exts = { 0, };
+
+ cls_flower.common.chain_index = chain->index;
+ cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+ cls_flower.cookie = (unsigned long) tmplt;
+ cls_flower.dissector = &tmplt->dissector;
+ cls_flower.mask = &tmplt->mask;
+ cls_flower.key = &tmplt->dummy_key;
+ cls_flower.exts = &dummy_exts;
+
+ /* We don't care if driver (any of them) fails to handle this
+ * call. It serves just as a hint for it.
+ */
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+}
+
+static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = chain->block;
+
+ cls_flower.common.chain_index = chain->index;
+ cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+ cls_flower.cookie = (unsigned long) tmplt;
+
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+}
+
+static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ struct fl_flow_tmplt *tmplt;
+ struct nlattr **tb;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return ERR_PTR(-EINVAL);
+
+ tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
+ if (!tb)
+ return ERR_PTR(-ENOBUFS);
+ err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
+ fl_policy, NULL);
+ if (err)
+ goto errout_tb;
+
+ tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL);
+ if (!tmplt) {
+ err = -ENOMEM;
+ goto errout_tb;
+ }
+ tmplt->chain = chain;
+ err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack);
+ if (err)
+ goto errout_tmplt;
+ kfree(tb);
+
+ fl_init_dissector(&tmplt->dissector, &tmplt->mask);
+
+ fl_hw_create_tmplt(chain, tmplt);
+
+ return tmplt;
+
+errout_tmplt:
+ kfree(tmplt);
+errout_tb:
+ kfree(tb);
+ return ERR_PTR(err);
+}
+
+static void fl_tmplt_destroy(void *tmplt_priv)
+{
+ struct fl_flow_tmplt *tmplt = tmplt_priv;
+
+ fl_hw_destroy_tmplt(tmplt->chain, tmplt);
+ kfree(tmplt);
+}
+
+static int fl_dump_key_val(struct sk_buff *skb,
+ void *val, int val_type,
+ void *mask, int mask_type, int len)
+{
+ int err;
+
+ if (!memchr_inv(mask, 0, len))
+ return 0;
+ err = nla_put(skb, val_type, len, val);
+ if (err)
+ return err;
+ if (mask_type != TCA_FLOWER_UNSPEC) {
+ err = nla_put(skb, mask_type, len, mask);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int fl_dump_key_mpls(struct sk_buff *skb,
+ struct flow_dissector_key_mpls *mpls_key,
+ struct flow_dissector_key_mpls *mpls_mask)
+{
+ int err;
+
+ if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask)))
+ return 0;
+ if (mpls_mask->mpls_ttl) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL,
+ mpls_key->mpls_ttl);
+ if (err)
+ return err;
+ }
+ if (mpls_mask->mpls_tc) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC,
+ mpls_key->mpls_tc);
+ if (err)
+ return err;
+ }
+ if (mpls_mask->mpls_label) {
+ err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL,
+ mpls_key->mpls_label);
+ if (err)
+ return err;
+ }
+ if (mpls_mask->mpls_bos) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS,
+ mpls_key->mpls_bos);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int fl_dump_key_ip(struct sk_buff *skb, bool encap,
+ struct flow_dissector_key_ip *key,
+ struct flow_dissector_key_ip *mask)
+{
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
+
+ if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) ||
+ fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)))
+ return -1;
+
+ return 0;
+}
+
+static int fl_dump_key_vlan(struct sk_buff *skb,
+ int vlan_id_key, int vlan_prio_key,
+ struct flow_dissector_key_vlan *vlan_key,
+ struct flow_dissector_key_vlan *vlan_mask)
+{
+ int err;
+
+ if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
+ return 0;
+ if (vlan_mask->vlan_id) {
+ err = nla_put_u16(skb, vlan_id_key,
+ vlan_key->vlan_id);
+ if (err)
+ return err;
+ }
+ if (vlan_mask->vlan_priority) {
+ err = nla_put_u8(skb, vlan_prio_key,
+ vlan_key->vlan_priority);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask,
+ u32 *flower_key, u32 *flower_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (dissector_mask & dissector_flag_bit) {
+ *flower_mask |= flower_flag_bit;
+ if (dissector_key & dissector_flag_bit)
+ *flower_key |= flower_flag_bit;
+ }
+}
+
+static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
+{
+ u32 key, mask;
+ __be32 _key, _mask;
+ int err;
+
+ if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask)))
+ return 0;
+
+ key = 0;
+ mask = 0;
+
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
+ FLOW_DIS_FIRST_FRAG);
+
+ _key = cpu_to_be32(key);
+ _mask = cpu_to_be32(mask);
+
+ err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key);
+ if (err)
+ return err;
+
+ return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
+}
+
+static int fl_dump_key_geneve_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct geneve_opt *opt;
+ struct nlattr *nest;
+ int opt_off = 0;
+
+ nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
+ if (!nest)
+ goto nla_put_failure;
+
+ while (enc_opts->len > opt_off) {
+ opt = (struct geneve_opt *)&enc_opts->data[opt_off];
+
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS,
+ opt->opt_class))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE,
+ opt->type))
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA,
+ opt->length * 4, opt->opt_data))
+ goto nla_put_failure;
+
+ opt_off += sizeof(struct geneve_opt) + opt->length * 4;
+ }
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct nlattr *nest;
+ int err;
+
+ if (!enc_opts->len)
+ return 0;
+
+ nest = nla_nest_start(skb, enc_opt_type);
+ if (!nest)
+ goto nla_put_failure;
+
+ switch (enc_opts->dst_opt_type) {
+ case TUNNEL_GENEVE_OPT:
+ err = fl_dump_key_geneve_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ default:
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_enc_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *key_opts,
+ struct flow_dissector_key_enc_opts *msk_opts)
+{
+ int err;
+
+ err = fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS, key_opts);
+ if (err)
+ return err;
+
+ return fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS_MASK, msk_opts);
+}
+
+static int fl_dump_key(struct sk_buff *skb, struct net *net,
+ struct fl_flow_key *key, struct fl_flow_key *mask)
+{
+ if (mask->indev_ifindex) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_index(net, key->indev_ifindex);
+ if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name))
+ goto nla_put_failure;
+ }
+
+ if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
+ mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
+ sizeof(key->eth.dst)) ||
+ fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC,
+ mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
+ sizeof(key->eth.src)) ||
+ fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE,
+ &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto)))
+ goto nla_put_failure;
+
+ if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls))
+ goto nla_put_failure;
+
+ if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID,
+ TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan))
+ goto nla_put_failure;
+
+ if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID,
+ TCA_FLOWER_KEY_CVLAN_PRIO,
+ &key->cvlan, &mask->cvlan) ||
+ (mask->cvlan.vlan_tpid &&
+ nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ key->cvlan.vlan_tpid)))
+ goto nla_put_failure;
+
+ if (mask->basic.n_proto) {
+ if (mask->cvlan.vlan_eth_type) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ key->basic.n_proto))
+ goto nla_put_failure;
+ } else if (mask->vlan.vlan_eth_type) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ key->vlan.vlan_eth_type))
+ goto nla_put_failure;
+ }
+ }
+
+ if ((key->basic.n_proto == htons(ETH_P_IP) ||
+ key->basic.n_proto == htons(ETH_P_IPV6)) &&
+ (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
+ &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.ip_proto)) ||
+ fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
+ goto nla_put_failure;
+
+ if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
+ (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
+ &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
+ sizeof(key->ipv4.src)) ||
+ fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST,
+ &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK,
+ sizeof(key->ipv4.dst))))
+ goto nla_put_failure;
+ else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
+ (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
+ &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+ sizeof(key->ipv6.src)) ||
+ fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST,
+ &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK,
+ sizeof(key->ipv6.dst))))
+ goto nla_put_failure;
+
+ if (key->basic.ip_proto == IPPROTO_TCP &&
+ (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
+ sizeof(key->tp.src)) ||
+ fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
+ sizeof(key->tp.dst)) ||
+ fl_dump_key_val(skb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS,
+ &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK,
+ sizeof(key->tcp.flags))))
+ goto nla_put_failure;
+ else if (key->basic.ip_proto == IPPROTO_UDP &&
+ (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
+ sizeof(key->tp.src)) ||
+ fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
+ sizeof(key->tp.dst))))
+ goto nla_put_failure;
+ else if (key->basic.ip_proto == IPPROTO_SCTP &&
+ (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
+ sizeof(key->tp.src)) ||
+ fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
+ sizeof(key->tp.dst))))
+ goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6 &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
+ else if ((key->basic.n_proto == htons(ETH_P_ARP) ||
+ key->basic.n_proto == htons(ETH_P_RARP)) &&
+ (fl_dump_key_val(skb, &key->arp.sip,
+ TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip,
+ TCA_FLOWER_KEY_ARP_SIP_MASK,
+ sizeof(key->arp.sip)) ||
+ fl_dump_key_val(skb, &key->arp.tip,
+ TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip,
+ TCA_FLOWER_KEY_ARP_TIP_MASK,
+ sizeof(key->arp.tip)) ||
+ fl_dump_key_val(skb, &key->arp.op,
+ TCA_FLOWER_KEY_ARP_OP, &mask->arp.op,
+ TCA_FLOWER_KEY_ARP_OP_MASK,
+ sizeof(key->arp.op)) ||
+ fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+ mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+ sizeof(key->arp.sha)) ||
+ fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+ mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+ sizeof(key->arp.tha))))
+ goto nla_put_failure;
+
+ if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
+ (fl_dump_key_val(skb, &key->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+ sizeof(key->enc_ipv4.src)) ||
+ fl_dump_key_val(skb, &key->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+ sizeof(key->enc_ipv4.dst))))
+ goto nla_put_failure;
+ else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
+ (fl_dump_key_val(skb, &key->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+ sizeof(key->enc_ipv6.src)) ||
+ fl_dump_key_val(skb, &key->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST,
+ &mask->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+ sizeof(key->enc_ipv6.dst))))
+ goto nla_put_failure;
+
+ if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id, TCA_FLOWER_UNSPEC,
+ sizeof(key->enc_key_id)) ||
+ fl_dump_key_val(skb, &key->enc_tp.src,
+ TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
+ &mask->enc_tp.src,
+ TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
+ sizeof(key->enc_tp.src)) ||
+ fl_dump_key_val(skb, &key->enc_tp.dst,
+ TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
+ &mask->enc_tp.dst,
+ TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
+ sizeof(key->enc_tp.dst)) ||
+ fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip) ||
+ fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
+ goto nla_put_failure;
+
+ if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_fl_filter *f = fh;
+ struct nlattr *nest;
+ struct fl_flow_key *key, *mask;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ key = &f->key;
+ mask = &f->mask->key;
+
+ if (fl_dump_key(skb, net, key, mask))
+ goto nla_put_failure;
+
+ if (!tc_skip_hw(f->flags))
+ fl_hw_update_stats(tp, f);
+
+ if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv)
+{
+ struct fl_flow_tmplt *tmplt = tmplt_priv;
+ struct fl_flow_key *key, *mask;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ key = &tmplt->dummy_key;
+ mask = &tmplt->mask;
+
+ if (fl_dump_key(skb, net, key, mask))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
+{
+ struct cls_fl_filter *f = fh;
+
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
+}
+
+static struct tcf_proto_ops cls_fl_ops __read_mostly = {
+ .kind = "flower",
+ .classify = fl_classify,
+ .init = fl_init,
+ .destroy = fl_destroy,
+ .get = fl_get,
+ .change = fl_change,
+ .delete = fl_delete,
+ .walk = fl_walk,
+ .reoffload = fl_reoffload,
+ .dump = fl_dump,
+ .bind_class = fl_bind_class,
+ .tmplt_create = fl_tmplt_create,
+ .tmplt_destroy = fl_tmplt_destroy,
+ .tmplt_dump = fl_tmplt_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init cls_fl_init(void)
+{
+ return register_tcf_proto_ops(&cls_fl_ops);
+}
+
+static void __exit cls_fl_exit(void)
+{
+ unregister_tcf_proto_ops(&cls_fl_ops);
+}
+
+module_init(cls_fl_init);
+module_exit(cls_fl_exit);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("Flower classifier");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
new file mode 100644
index 000000000..cb2c62605
--- /dev/null
+++ b/net/sched/cls_fw.c
@@ -0,0 +1,474 @@
+/*
+ * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
+ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
+ * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
+ *
+ * JHS: We should remove the CONFIG_NET_CLS_IND from here
+ * eventually when the meta match extension is made available
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <net/sch_generic.h>
+
+#define HTSIZE 256
+
+struct fw_head {
+ u32 mask;
+ struct fw_filter __rcu *ht[HTSIZE];
+ struct rcu_head rcu;
+};
+
+struct fw_filter {
+ struct fw_filter __rcu *next;
+ u32 id;
+ struct tcf_result res;
+#ifdef CONFIG_NET_CLS_IND
+ int ifindex;
+#endif /* CONFIG_NET_CLS_IND */
+ struct tcf_exts exts;
+ struct tcf_proto *tp;
+ struct rcu_work rwork;
+};
+
+static u32 fw_hash(u32 handle)
+{
+ handle ^= (handle >> 16);
+ handle ^= (handle >> 8);
+ return handle % HTSIZE;
+}
+
+static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct fw_head *head = rcu_dereference_bh(tp->root);
+ struct fw_filter *f;
+ int r;
+ u32 id = skb->mark;
+
+ if (head != NULL) {
+ id &= head->mask;
+
+ for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f;
+ f = rcu_dereference_bh(f->next)) {
+ if (f->id == id) {
+ *res = f->res;
+#ifdef CONFIG_NET_CLS_IND
+ if (!tcf_match_indev(skb, f->ifindex))
+ continue;
+#endif /* CONFIG_NET_CLS_IND */
+ r = tcf_exts_exec(skb, &f->exts, res);
+ if (r < 0)
+ continue;
+
+ return r;
+ }
+ }
+ } else {
+ struct Qdisc *q = tcf_block_q(tp->chain->block);
+
+ /* Old method: classify the packet using its skb mark. */
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id ^ q->handle)))) {
+ res->classid = id;
+ res->class = 0;
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+static void *fw_get(struct tcf_proto *tp, u32 handle)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f;
+
+ if (head == NULL)
+ return NULL;
+
+ f = rtnl_dereference(head->ht[fw_hash(handle)]);
+ for (; f; f = rtnl_dereference(f->next)) {
+ if (f->id == handle)
+ return f;
+ }
+ return NULL;
+}
+
+static int fw_init(struct tcf_proto *tp)
+{
+ /* We don't allocate fw_head here, because in the old method
+ * we don't need it at all.
+ */
+ return 0;
+}
+
+static void __fw_delete_filter(struct fw_filter *f)
+{
+ tcf_exts_destroy(&f->exts);
+ tcf_exts_put_net(&f->exts);
+ kfree(f);
+}
+
+static void fw_delete_filter_work(struct work_struct *work)
+{
+ struct fw_filter *f = container_of(to_rcu_work(work),
+ struct fw_filter,
+ rwork);
+ rtnl_lock();
+ __fw_delete_filter(f);
+ rtnl_unlock();
+}
+
+static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f;
+ int h;
+
+ if (head == NULL)
+ return;
+
+ for (h = 0; h < HTSIZE; h++) {
+ while ((f = rtnl_dereference(head->ht[h])) != NULL) {
+ RCU_INIT_POINTER(head->ht[h],
+ rtnl_dereference(f->next));
+ tcf_unbind_filter(tp, &f->res);
+ if (tcf_exts_get_net(&f->exts))
+ tcf_queue_work(&f->rwork, fw_delete_filter_work);
+ else
+ __fw_delete_filter(f);
+ }
+ }
+ kfree_rcu(head, rcu);
+}
+
+static int fw_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f = arg;
+ struct fw_filter __rcu **fp;
+ struct fw_filter *pfp;
+ int ret = -EINVAL;
+ int h;
+
+ if (head == NULL || f == NULL)
+ goto out;
+
+ fp = &head->ht[fw_hash(f->id)];
+
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+ if (pfp == f) {
+ RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
+ tcf_unbind_filter(tp, &f->res);
+ tcf_exts_get_net(&f->exts);
+ tcf_queue_work(&f->rwork, fw_delete_filter_work);
+ ret = 0;
+ break;
+ }
+ }
+
+ *last = true;
+ for (h = 0; h < HTSIZE; h++) {
+ if (rcu_access_pointer(head->ht[h])) {
+ *last = false;
+ break;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
+ [TCA_FW_CLASSID] = { .type = NLA_U32 },
+ [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
+ [TCA_FW_MASK] = { .type = NLA_U32 },
+};
+
+static int fw_set_parms(struct net *net, struct tcf_proto *tp,
+ struct fw_filter *f, struct nlattr **tb,
+ struct nlattr **tca, unsigned long base, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ u32 mask;
+ int err;
+
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr,
+ extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_FW_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+#ifdef CONFIG_NET_CLS_IND
+ if (tb[TCA_FW_INDEV]) {
+ int ret;
+ ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
+ if (ret < 0)
+ return ret;
+ f->ifindex = ret;
+ }
+#endif /* CONFIG_NET_CLS_IND */
+
+ err = -EINVAL;
+ if (tb[TCA_FW_MASK]) {
+ mask = nla_get_u32(tb[TCA_FW_MASK]);
+ if (mask != head->mask)
+ return err;
+ } else if (head->mask != 0xFFFFFFFF)
+ return err;
+
+ return 0;
+}
+
+static int fw_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca, void **arg,
+ bool ovr, struct netlink_ext_ack *extack)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f = *arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_FW_MAX + 1];
+ int err;
+
+ if (!opt)
+ return handle ? -EINVAL : 0; /* Succeed if it is old method. */
+
+ err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (f) {
+ struct fw_filter *pfp, *fnew;
+ struct fw_filter __rcu **fp;
+
+ if (f->id != handle && handle)
+ return -EINVAL;
+
+ fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
+ if (!fnew)
+ return -ENOBUFS;
+
+ fnew->id = f->id;
+ fnew->res = f->res;
+#ifdef CONFIG_NET_CLS_IND
+ fnew->ifindex = f->ifindex;
+#endif /* CONFIG_NET_CLS_IND */
+ fnew->tp = f->tp;
+
+ err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ if (err < 0) {
+ kfree(fnew);
+ return err;
+ }
+
+ err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack);
+ if (err < 0) {
+ tcf_exts_destroy(&fnew->exts);
+ kfree(fnew);
+ return err;
+ }
+
+ fp = &head->ht[fw_hash(fnew->id)];
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp))
+ if (pfp == f)
+ break;
+
+ RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next));
+ rcu_assign_pointer(*fp, fnew);
+ tcf_unbind_filter(tp, &f->res);
+ tcf_exts_get_net(&f->exts);
+ tcf_queue_work(&f->rwork, fw_delete_filter_work);
+
+ *arg = fnew;
+ return err;
+ }
+
+ if (!handle)
+ return -EINVAL;
+
+ if (!head) {
+ u32 mask = 0xFFFFFFFF;
+ if (tb[TCA_FW_MASK])
+ mask = nla_get_u32(tb[TCA_FW_MASK]);
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ return -ENOBUFS;
+ head->mask = mask;
+
+ rcu_assign_pointer(tp->root, head);
+ }
+
+ f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
+ if (f == NULL)
+ return -ENOBUFS;
+
+ err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ if (err < 0)
+ goto errout;
+ f->id = handle;
+ f->tp = tp;
+
+ err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack);
+ if (err < 0)
+ goto errout;
+
+ RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]);
+ rcu_assign_pointer(head->ht[fw_hash(handle)], f);
+
+ *arg = f;
+ return 0;
+
+errout:
+ tcf_exts_destroy(&f->exts);
+ kfree(f);
+ return err;
+}
+
+static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ int h;
+
+ if (head == NULL)
+ arg->stop = 1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h < HTSIZE; h++) {
+ struct fw_filter *f;
+
+ for (f = rtnl_dereference(head->ht[h]); f;
+ f = rtnl_dereference(f->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f = fh;
+ struct nlattr *nest;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->id;
+
+ if (!f->res.classid && !tcf_exts_has_actions(&f->exts))
+ return skb->len;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
+ goto nla_put_failure;
+#ifdef CONFIG_NET_CLS_IND
+ if (f->ifindex) {
+ struct net_device *dev;
+ dev = __dev_get_by_index(net, f->ifindex);
+ if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
+ goto nla_put_failure;
+ }
+#endif /* CONFIG_NET_CLS_IND */
+ if (head->mask != 0xFFFFFFFF &&
+ nla_put_u32(skb, TCA_FW_MASK, head->mask))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
+{
+ struct fw_filter *f = fh;
+
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
+}
+
+static struct tcf_proto_ops cls_fw_ops __read_mostly = {
+ .kind = "fw",
+ .classify = fw_classify,
+ .init = fw_init,
+ .destroy = fw_destroy,
+ .get = fw_get,
+ .change = fw_change,
+ .delete = fw_delete,
+ .walk = fw_walk,
+ .dump = fw_dump,
+ .bind_class = fw_bind_class,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_fw(void)
+{
+ return register_tcf_proto_ops(&cls_fw_ops);
+}
+
+static void __exit exit_fw(void)
+{
+ unregister_tcf_proto_ops(&cls_fw_ops);
+}
+
+module_init(init_fw)
+module_exit(exit_fw)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
new file mode 100644
index 000000000..74863b0ff
--- /dev/null
+++ b/net/sched/cls_matchall.c
@@ -0,0 +1,357 @@
+/*
+ * net/sched/cls_matchll.c Match-all classifier
+ *
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+
+struct cls_mall_head {
+ struct tcf_exts exts;
+ struct tcf_result res;
+ u32 handle;
+ u32 flags;
+ unsigned int in_hw_count;
+ struct rcu_work rwork;
+};
+
+static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_mall_head *head = rcu_dereference_bh(tp->root);
+
+ if (tc_skip_sw(head->flags))
+ return -1;
+
+ *res = head->res;
+ return tcf_exts_exec(skb, &head->exts, res);
+}
+
+static int mall_init(struct tcf_proto *tp)
+{
+ return 0;
+}
+
+static void __mall_destroy(struct cls_mall_head *head)
+{
+ tcf_exts_destroy(&head->exts);
+ tcf_exts_put_net(&head->exts);
+ kfree(head);
+}
+
+static void mall_destroy_work(struct work_struct *work)
+{
+ struct cls_mall_head *head = container_of(to_rcu_work(work),
+ struct cls_mall_head,
+ rwork);
+ rtnl_lock();
+ __mall_destroy(head);
+ rtnl_unlock();
+}
+
+static void mall_destroy_hw_filter(struct tcf_proto *tp,
+ struct cls_mall_head *head,
+ unsigned long cookie,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_cls_matchall_offload cls_mall = {};
+ struct tcf_block *block = tp->chain->block;
+
+ tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
+ cls_mall.command = TC_CLSMATCHALL_DESTROY;
+ cls_mall.cookie = cookie;
+
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+ tcf_block_offload_dec(block, &head->flags);
+}
+
+static int mall_replace_hw_filter(struct tcf_proto *tp,
+ struct cls_mall_head *head,
+ unsigned long cookie,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_cls_matchall_offload cls_mall = {};
+ struct tcf_block *block = tp->chain->block;
+ bool skip_sw = tc_skip_sw(head->flags);
+ int err;
+
+ tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
+ cls_mall.command = TC_CLSMATCHALL_REPLACE;
+ cls_mall.exts = &head->exts;
+ cls_mall.cookie = cookie;
+
+ err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL,
+ &cls_mall, skip_sw);
+ if (err < 0) {
+ mall_destroy_hw_filter(tp, head, cookie, NULL);
+ return err;
+ } else if (err > 0) {
+ head->in_hw_count = err;
+ tcf_block_offload_inc(block, &head->flags);
+ }
+
+ if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ if (!head)
+ return;
+
+ tcf_unbind_filter(tp, &head->res);
+
+ if (!tc_skip_hw(head->flags))
+ mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
+
+ if (tcf_exts_get_net(&head->exts))
+ tcf_queue_work(&head->rwork, mall_destroy_work);
+ else
+ __mall_destroy(head);
+}
+
+static void *mall_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ if (head && head->handle == handle)
+ return head;
+
+ return NULL;
+}
+
+static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
+ [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC },
+ [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 },
+ [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 },
+};
+
+static int mall_set_parms(struct net *net, struct tcf_proto *tp,
+ struct cls_mall_head *head,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_MATCHALL_CLASSID]) {
+ head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
+ tcf_bind_filter(tp, &head->res, base);
+ }
+ return 0;
+}
+
+static int mall_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ void **arg, bool ovr, struct netlink_ext_ack *extack)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct nlattr *tb[TCA_MATCHALL_MAX + 1];
+ struct cls_mall_head *new;
+ u32 flags = 0;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return -EINVAL;
+
+ if (head)
+ return -EEXIST;
+
+ err = nla_parse_nested(tb, TCA_MATCHALL_MAX, tca[TCA_OPTIONS],
+ mall_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_MATCHALL_FLAGS]) {
+ flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]);
+ if (!tc_flags_valid(flags))
+ return -EINVAL;
+ }
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return -ENOBUFS;
+
+ err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
+ if (err)
+ goto err_exts_init;
+
+ if (!handle)
+ handle = 1;
+ new->handle = handle;
+ new->flags = flags;
+
+ err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr,
+ extack);
+ if (err)
+ goto err_set_parms;
+
+ if (!tc_skip_hw(new->flags)) {
+ err = mall_replace_hw_filter(tp, new, (unsigned long)new,
+ extack);
+ if (err)
+ goto err_replace_hw_filter;
+ }
+
+ if (!tc_in_hw(new->flags))
+ new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
+ *arg = head;
+ rcu_assign_pointer(tp->root, new);
+ return 0;
+
+err_replace_hw_filter:
+err_set_parms:
+ tcf_exts_destroy(&new->exts);
+err_exts_init:
+ kfree(new);
+ return err;
+}
+
+static int mall_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, head, arg) < 0)
+ arg->stop = 1;
+skip:
+ arg->count++;
+}
+
+static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct tc_cls_matchall_offload cls_mall = {};
+ struct tcf_block *block = tp->chain->block;
+ int err;
+
+ if (tc_skip_hw(head->flags))
+ return 0;
+
+ tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
+ cls_mall.command = add ?
+ TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
+ cls_mall.exts = &head->exts;
+ cls_mall.cookie = (unsigned long)head;
+
+ err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(head->flags))
+ return err;
+ return 0;
+ }
+
+ tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+
+ return 0;
+}
+
+static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_mall_head *head = fh;
+ struct nlattr *nest;
+
+ if (!head)
+ return skb->len;
+
+ t->tcm_handle = head->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (head->res.classid &&
+ nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid))
+ goto nla_put_failure;
+
+ if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &head->exts))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &head->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
+{
+ struct cls_mall_head *head = fh;
+
+ if (head && head->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &head->res, base);
+ else
+ __tcf_unbind_filter(q, &head->res);
+ }
+}
+
+static struct tcf_proto_ops cls_mall_ops __read_mostly = {
+ .kind = "matchall",
+ .classify = mall_classify,
+ .init = mall_init,
+ .destroy = mall_destroy,
+ .get = mall_get,
+ .change = mall_change,
+ .delete = mall_delete,
+ .walk = mall_walk,
+ .reoffload = mall_reoffload,
+ .dump = mall_dump,
+ .bind_class = mall_bind_class,
+ .owner = THIS_MODULE,
+};
+
+static int __init cls_mall_init(void)
+{
+ return register_tcf_proto_ops(&cls_mall_ops);
+}
+
+static void __exit cls_mall_exit(void)
+{
+ unregister_tcf_proto_ops(&cls_mall_ops);
+}
+
+module_init(cls_mall_init);
+module_exit(cls_mall_exit);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Match-all classifier");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
new file mode 100644
index 000000000..0256777b8
--- /dev/null
+++ b/net/sched/cls_route.c
@@ -0,0 +1,687 @@
+/*
+ * net/sched/cls_route.c ROUTE4 classifier.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+/*
+ * 1. For now we assume that route tags < 256.
+ * It allows to use direct table lookups, instead of hash tables.
+ * 2. For now we assume that "from TAG" and "fromdev DEV" statements
+ * are mutually exclusive.
+ * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
+ */
+struct route4_fastmap {
+ struct route4_filter *filter;
+ u32 id;
+ int iif;
+};
+
+struct route4_head {
+ struct route4_fastmap fastmap[16];
+ struct route4_bucket __rcu *table[256 + 1];
+ struct rcu_head rcu;
+};
+
+struct route4_bucket {
+ /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
+ struct route4_filter __rcu *ht[16 + 16 + 1];
+ struct rcu_head rcu;
+};
+
+struct route4_filter {
+ struct route4_filter __rcu *next;
+ u32 id;
+ int iif;
+
+ struct tcf_result res;
+ struct tcf_exts exts;
+ u32 handle;
+ struct route4_bucket *bkt;
+ struct tcf_proto *tp;
+ struct rcu_work rwork;
+};
+
+#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
+
+static inline int route4_fastmap_hash(u32 id, int iif)
+{
+ return id & 0xF;
+}
+
+static DEFINE_SPINLOCK(fastmap_lock);
+static void
+route4_reset_fastmap(struct route4_head *head)
+{
+ spin_lock_bh(&fastmap_lock);
+ memset(head->fastmap, 0, sizeof(head->fastmap));
+ spin_unlock_bh(&fastmap_lock);
+}
+
+static void
+route4_set_fastmap(struct route4_head *head, u32 id, int iif,
+ struct route4_filter *f)
+{
+ int h = route4_fastmap_hash(id, iif);
+
+ /* fastmap updates must look atomic to aling id, iff, filter */
+ spin_lock_bh(&fastmap_lock);
+ head->fastmap[h].id = id;
+ head->fastmap[h].iif = iif;
+ head->fastmap[h].filter = f;
+ spin_unlock_bh(&fastmap_lock);
+}
+
+static inline int route4_hash_to(u32 id)
+{
+ return id & 0xFF;
+}
+
+static inline int route4_hash_from(u32 id)
+{
+ return (id >> 16) & 0xF;
+}
+
+static inline int route4_hash_iif(int iif)
+{
+ return 16 + ((iif >> 16) & 0xF);
+}
+
+static inline int route4_hash_wild(void)
+{
+ return 32;
+}
+
+#define ROUTE4_APPLY_RESULT() \
+{ \
+ *res = f->res; \
+ if (tcf_exts_has_actions(&f->exts)) { \
+ int r = tcf_exts_exec(skb, &f->exts, res); \
+ if (r < 0) { \
+ dont_cache = 1; \
+ continue; \
+ } \
+ return r; \
+ } else if (!dont_cache) \
+ route4_set_fastmap(head, id, iif, f); \
+ return 0; \
+}
+
+static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct route4_head *head = rcu_dereference_bh(tp->root);
+ struct dst_entry *dst;
+ struct route4_bucket *b;
+ struct route4_filter *f;
+ u32 id, h;
+ int iif, dont_cache = 0;
+
+ dst = skb_dst(skb);
+ if (!dst)
+ goto failure;
+
+ id = dst->tclassid;
+
+ iif = inet_iif(skb);
+
+ h = route4_fastmap_hash(id, iif);
+
+ spin_lock(&fastmap_lock);
+ if (id == head->fastmap[h].id &&
+ iif == head->fastmap[h].iif &&
+ (f = head->fastmap[h].filter) != NULL) {
+ if (f == ROUTE4_FAILURE) {
+ spin_unlock(&fastmap_lock);
+ goto failure;
+ }
+
+ *res = f->res;
+ spin_unlock(&fastmap_lock);
+ return 0;
+ }
+ spin_unlock(&fastmap_lock);
+
+ h = route4_hash_to(id);
+
+restart:
+ b = rcu_dereference_bh(head->table[h]);
+ if (b) {
+ for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]);
+ f;
+ f = rcu_dereference_bh(f->next))
+ if (f->id == id)
+ ROUTE4_APPLY_RESULT();
+
+ for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]);
+ f;
+ f = rcu_dereference_bh(f->next))
+ if (f->iif == iif)
+ ROUTE4_APPLY_RESULT();
+
+ for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]);
+ f;
+ f = rcu_dereference_bh(f->next))
+ ROUTE4_APPLY_RESULT();
+ }
+ if (h < 256) {
+ h = 256;
+ id &= ~0xFFFF;
+ goto restart;
+ }
+
+ if (!dont_cache)
+ route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
+failure:
+ return -1;
+}
+
+static inline u32 to_hash(u32 id)
+{
+ u32 h = id & 0xFF;
+
+ if (id & 0x8000)
+ h += 256;
+ return h;
+}
+
+static inline u32 from_hash(u32 id)
+{
+ id &= 0xFFFF;
+ if (id == 0xFFFF)
+ return 32;
+ if (!(id & 0x8000)) {
+ if (id > 255)
+ return 256;
+ return id & 0xF;
+ }
+ return 16 + (id & 0xF);
+}
+
+static void *route4_get(struct tcf_proto *tp, u32 handle)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ struct route4_bucket *b;
+ struct route4_filter *f;
+ unsigned int h1, h2;
+
+ h1 = to_hash(handle);
+ if (h1 > 256)
+ return NULL;
+
+ h2 = from_hash(handle >> 16);
+ if (h2 > 32)
+ return NULL;
+
+ b = rtnl_dereference(head->table[h1]);
+ if (b) {
+ for (f = rtnl_dereference(b->ht[h2]);
+ f;
+ f = rtnl_dereference(f->next))
+ if (f->handle == handle)
+ return f;
+ }
+ return NULL;
+}
+
+static int route4_init(struct tcf_proto *tp)
+{
+ struct route4_head *head;
+
+ head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+
+ rcu_assign_pointer(tp->root, head);
+ return 0;
+}
+
+static void __route4_delete_filter(struct route4_filter *f)
+{
+ tcf_exts_destroy(&f->exts);
+ tcf_exts_put_net(&f->exts);
+ kfree(f);
+}
+
+static void route4_delete_filter_work(struct work_struct *work)
+{
+ struct route4_filter *f = container_of(to_rcu_work(work),
+ struct route4_filter,
+ rwork);
+ rtnl_lock();
+ __route4_delete_filter(f);
+ rtnl_unlock();
+}
+
+static void route4_queue_work(struct route4_filter *f)
+{
+ tcf_queue_work(&f->rwork, route4_delete_filter_work);
+}
+
+static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ int h1, h2;
+
+ if (head == NULL)
+ return;
+
+ for (h1 = 0; h1 <= 256; h1++) {
+ struct route4_bucket *b;
+
+ b = rtnl_dereference(head->table[h1]);
+ if (b) {
+ for (h2 = 0; h2 <= 32; h2++) {
+ struct route4_filter *f;
+
+ while ((f = rtnl_dereference(b->ht[h2])) != NULL) {
+ struct route4_filter *next;
+
+ next = rtnl_dereference(f->next);
+ RCU_INIT_POINTER(b->ht[h2], next);
+ tcf_unbind_filter(tp, &f->res);
+ if (tcf_exts_get_net(&f->exts))
+ route4_queue_work(f);
+ else
+ __route4_delete_filter(f);
+ }
+ }
+ RCU_INIT_POINTER(head->table[h1], NULL);
+ kfree_rcu(b, rcu);
+ }
+ }
+ kfree_rcu(head, rcu);
+}
+
+static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ struct route4_filter *f = arg;
+ struct route4_filter __rcu **fp;
+ struct route4_filter *nf;
+ struct route4_bucket *b;
+ unsigned int h = 0;
+ int i, h1;
+
+ if (!head || !f)
+ return -EINVAL;
+
+ h = f->handle;
+ b = f->bkt;
+
+ fp = &b->ht[from_hash(h >> 16)];
+ for (nf = rtnl_dereference(*fp); nf;
+ fp = &nf->next, nf = rtnl_dereference(*fp)) {
+ if (nf == f) {
+ /* unlink it */
+ RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
+
+ /* Remove any fastmap lookups that might ref filter
+ * notice we unlink'd the filter so we can't get it
+ * back in the fastmap.
+ */
+ route4_reset_fastmap(head);
+
+ /* Delete it */
+ tcf_unbind_filter(tp, &f->res);
+ tcf_exts_get_net(&f->exts);
+ tcf_queue_work(&f->rwork, route4_delete_filter_work);
+
+ /* Strip RTNL protected tree */
+ for (i = 0; i <= 32; i++) {
+ struct route4_filter *rt;
+
+ rt = rtnl_dereference(b->ht[i]);
+ if (rt)
+ goto out;
+ }
+
+ /* OK, session has no flows */
+ RCU_INIT_POINTER(head->table[to_hash(h)], NULL);
+ kfree_rcu(b, rcu);
+ break;
+ }
+ }
+
+out:
+ *last = true;
+ for (h1 = 0; h1 <= 256; h1++) {
+ if (rcu_access_pointer(head->table[h1])) {
+ *last = false;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
+ [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 },
+ [TCA_ROUTE4_TO] = { .type = NLA_U32 },
+ [TCA_ROUTE4_FROM] = { .type = NLA_U32 },
+ [TCA_ROUTE4_IIF] = { .type = NLA_U32 },
+};
+
+static int route4_set_parms(struct net *net, struct tcf_proto *tp,
+ unsigned long base, struct route4_filter *f,
+ u32 handle, struct route4_head *head,
+ struct nlattr **tb, struct nlattr *est, int new,
+ bool ovr, struct netlink_ext_ack *extack)
+{
+ u32 id = 0, to = 0, nhandle = 0x8000;
+ struct route4_filter *fp;
+ unsigned int h1;
+ struct route4_bucket *b;
+ int err;
+
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_ROUTE4_TO]) {
+ if (new && handle & 0x8000)
+ return -EINVAL;
+ to = nla_get_u32(tb[TCA_ROUTE4_TO]);
+ if (to > 0xFF)
+ return -EINVAL;
+ nhandle = to;
+ }
+
+ if (tb[TCA_ROUTE4_FROM]) {
+ if (tb[TCA_ROUTE4_IIF])
+ return -EINVAL;
+ id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
+ if (id > 0xFF)
+ return -EINVAL;
+ nhandle |= id << 16;
+ } else if (tb[TCA_ROUTE4_IIF]) {
+ id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
+ if (id > 0x7FFF)
+ return -EINVAL;
+ nhandle |= (id | 0x8000) << 16;
+ } else
+ nhandle |= 0xFFFF << 16;
+
+ if (handle && new) {
+ nhandle |= handle & 0x7F00;
+ if (nhandle != handle)
+ return -EINVAL;
+ }
+
+ h1 = to_hash(nhandle);
+ b = rtnl_dereference(head->table[h1]);
+ if (!b) {
+ b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
+ if (b == NULL)
+ return -ENOBUFS;
+
+ rcu_assign_pointer(head->table[h1], b);
+ } else {
+ unsigned int h2 = from_hash(nhandle >> 16);
+
+ for (fp = rtnl_dereference(b->ht[h2]);
+ fp;
+ fp = rtnl_dereference(fp->next))
+ if (fp->handle == f->handle)
+ return -EEXIST;
+ }
+
+ if (tb[TCA_ROUTE4_TO])
+ f->id = to;
+
+ if (tb[TCA_ROUTE4_FROM])
+ f->id = to | id<<16;
+ else if (tb[TCA_ROUTE4_IIF])
+ f->iif = id;
+
+ f->handle = nhandle;
+ f->bkt = b;
+ f->tp = tp;
+
+ if (tb[TCA_ROUTE4_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ return 0;
+}
+
+static int route4_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, void **arg, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ struct route4_filter __rcu **fp;
+ struct route4_filter *fold, *f1, *pfp, *f = NULL;
+ struct route4_bucket *b;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ROUTE4_MAX + 1];
+ unsigned int h, th;
+ int err;
+ bool new = true;
+
+ if (opt == NULL)
+ return handle ? -EINVAL : 0;
+
+ err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy, NULL);
+ if (err < 0)
+ return err;
+
+ fold = *arg;
+ if (fold && handle && fold->handle != handle)
+ return -EINVAL;
+
+ err = -ENOBUFS;
+ f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
+ if (!f)
+ goto errout;
+
+ err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ if (err < 0)
+ goto errout;
+
+ if (fold) {
+ f->id = fold->id;
+ f->iif = fold->iif;
+ f->res = fold->res;
+ f->handle = fold->handle;
+
+ f->tp = fold->tp;
+ f->bkt = fold->bkt;
+ new = false;
+ }
+
+ err = route4_set_parms(net, tp, base, f, handle, head, tb,
+ tca[TCA_RATE], new, ovr, extack);
+ if (err < 0)
+ goto errout;
+
+ h = from_hash(f->handle >> 16);
+ fp = &f->bkt->ht[h];
+ for (pfp = rtnl_dereference(*fp);
+ (f1 = rtnl_dereference(*fp)) != NULL;
+ fp = &f1->next)
+ if (f->handle < f1->handle)
+ break;
+
+ tcf_block_netif_keep_dst(tp->chain->block);
+ rcu_assign_pointer(f->next, f1);
+ rcu_assign_pointer(*fp, f);
+
+ if (fold && fold->handle && f->handle != fold->handle) {
+ th = to_hash(fold->handle);
+ h = from_hash(fold->handle >> 16);
+ b = rtnl_dereference(head->table[th]);
+ if (b) {
+ fp = &b->ht[h];
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+ if (pfp == fold) {
+ rcu_assign_pointer(*fp, fold->next);
+ break;
+ }
+ }
+ }
+ }
+
+ route4_reset_fastmap(head);
+ *arg = f;
+ if (fold) {
+ tcf_unbind_filter(tp, &fold->res);
+ tcf_exts_get_net(&fold->exts);
+ tcf_queue_work(&fold->rwork, route4_delete_filter_work);
+ }
+ return 0;
+
+errout:
+ if (f)
+ tcf_exts_destroy(&f->exts);
+ kfree(f);
+ return err;
+}
+
+static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ unsigned int h, h1;
+
+ if (head == NULL)
+ arg->stop = 1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h <= 256; h++) {
+ struct route4_bucket *b = rtnl_dereference(head->table[h]);
+
+ if (b) {
+ for (h1 = 0; h1 <= 32; h1++) {
+ struct route4_filter *f;
+
+ for (f = rtnl_dereference(b->ht[h1]);
+ f;
+ f = rtnl_dereference(f->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+ }
+ }
+}
+
+static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct route4_filter *f = fh;
+ struct nlattr *nest;
+ u32 id;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (!(f->handle & 0x8000)) {
+ id = f->id & 0xFF;
+ if (nla_put_u32(skb, TCA_ROUTE4_TO, id))
+ goto nla_put_failure;
+ }
+ if (f->handle & 0x80000000) {
+ if ((f->handle >> 16) != 0xFFFF &&
+ nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif))
+ goto nla_put_failure;
+ } else {
+ id = f->id >> 16;
+ if (nla_put_u32(skb, TCA_ROUTE4_FROM, id))
+ goto nla_put_failure;
+ }
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
+{
+ struct route4_filter *f = fh;
+
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
+}
+
+static struct tcf_proto_ops cls_route4_ops __read_mostly = {
+ .kind = "route",
+ .classify = route4_classify,
+ .init = route4_init,
+ .destroy = route4_destroy,
+ .get = route4_get,
+ .change = route4_change,
+ .delete = route4_delete,
+ .walk = route4_walk,
+ .dump = route4_dump,
+ .bind_class = route4_bind_class,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_route4(void)
+{
+ return register_tcf_proto_ops(&cls_route4_ops);
+}
+
+static void __exit exit_route4(void)
+{
+ unregister_tcf_proto_ops(&cls_route4_ops);
+}
+
+module_init(init_route4)
+module_exit(exit_route4)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
new file mode 100644
index 000000000..cbb5e0d60
--- /dev/null
+++ b/net/sched/cls_rsvp.c
@@ -0,0 +1,28 @@
+/*
+ * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+#define RSVP_DST_LEN 1
+#define RSVP_ID "rsvp"
+#define RSVP_OPS cls_rsvp_ops
+
+#include "cls_rsvp.h"
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
new file mode 100644
index 000000000..eb1dd2afc
--- /dev/null
+++ b/net/sched/cls_rsvp.h
@@ -0,0 +1,775 @@
+/*
+ * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+/*
+ Comparing to general packet classification problem,
+ RSVP needs only sevaral relatively simple rules:
+
+ * (dst, protocol) are always specified,
+ so that we are able to hash them.
+ * src may be exact, or may be wildcard, so that
+ we can keep a hash table plus one wildcard entry.
+ * source port (or flow label) is important only if src is given.
+
+ IMPLEMENTATION.
+
+ We use a two level hash table: The top level is keyed by
+ destination address and protocol ID, every bucket contains a list
+ of "rsvp sessions", identified by destination address, protocol and
+ DPI(="Destination Port ID"): triple (key, mask, offset).
+
+ Every bucket has a smaller hash table keyed by source address
+ (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
+ Every bucket is again a list of "RSVP flows", selected by
+ source address and SPI(="Source Port ID" here rather than
+ "security parameter index"): triple (key, mask, offset).
+
+
+ NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
+ and all fragmented packets go to the best-effort traffic class.
+
+
+ NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires
+ only one "Generalized Port Identifier". So that for classic
+ ah, esp (and udp,tcp) both *pi should coincide or one of them
+ should be wildcard.
+
+ At first sight, this redundancy is just a waste of CPU
+ resources. But DPI and SPI add the possibility to assign different
+ priorities to GPIs. Look also at note 4 about tunnels below.
+
+
+ NOTE 3. One complication is the case of tunneled packets.
+ We implement it as following: if the first lookup
+ matches a special session with "tunnelhdr" value not zero,
+ flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
+ In this case, we pull tunnelhdr bytes and restart lookup
+ with tunnel ID added to the list of keys. Simple and stupid 8)8)
+ It's enough for PIMREG and IPIP.
+
+
+ NOTE 4. Two GPIs make it possible to parse even GRE packets.
+ F.e. DPI can select ETH_P_IP (and necessary flags to make
+ tunnelhdr correct) in GRE protocol field and SPI matches
+ GRE key. Is it not nice? 8)8)
+
+
+ Well, as result, despite its simplicity, we get a pretty
+ powerful classification engine. */
+
+
+struct rsvp_head {
+ u32 tmap[256/32];
+ u32 hgenerator;
+ u8 tgenerator;
+ struct rsvp_session __rcu *ht[256];
+ struct rcu_head rcu;
+};
+
+struct rsvp_session {
+ struct rsvp_session __rcu *next;
+ __be32 dst[RSVP_DST_LEN];
+ struct tc_rsvp_gpi dpi;
+ u8 protocol;
+ u8 tunnelid;
+ /* 16 (src,sport) hash slots, and one wildcard source slot */
+ struct rsvp_filter __rcu *ht[16 + 1];
+ struct rcu_head rcu;
+};
+
+
+struct rsvp_filter {
+ struct rsvp_filter __rcu *next;
+ __be32 src[RSVP_DST_LEN];
+ struct tc_rsvp_gpi spi;
+ u8 tunnelhdr;
+
+ struct tcf_result res;
+ struct tcf_exts exts;
+
+ u32 handle;
+ struct rsvp_session *sess;
+ struct rcu_work rwork;
+};
+
+static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
+{
+ unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
+
+ h ^= h>>16;
+ h ^= h>>8;
+ return (h ^ protocol ^ tunnelid) & 0xFF;
+}
+
+static inline unsigned int hash_src(__be32 *src)
+{
+ unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
+
+ h ^= h>>16;
+ h ^= h>>8;
+ h ^= h>>4;
+ return h & 0xF;
+}
+
+#define RSVP_APPLY_RESULT() \
+{ \
+ int r = tcf_exts_exec(skb, &f->exts, res); \
+ if (r < 0) \
+ continue; \
+ else if (r > 0) \
+ return r; \
+}
+
+static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct rsvp_head *head = rcu_dereference_bh(tp->root);
+ struct rsvp_session *s;
+ struct rsvp_filter *f;
+ unsigned int h1, h2;
+ __be32 *dst, *src;
+ u8 protocol;
+ u8 tunnelid = 0;
+ u8 *xprt;
+#if RSVP_DST_LEN == 4
+ struct ipv6hdr *nhptr;
+
+ if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+ return -1;
+ nhptr = ipv6_hdr(skb);
+#else
+ struct iphdr *nhptr;
+
+ if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+ return -1;
+ nhptr = ip_hdr(skb);
+#endif
+restart:
+
+#if RSVP_DST_LEN == 4
+ src = &nhptr->saddr.s6_addr32[0];
+ dst = &nhptr->daddr.s6_addr32[0];
+ protocol = nhptr->nexthdr;
+ xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
+#else
+ src = &nhptr->saddr;
+ dst = &nhptr->daddr;
+ protocol = nhptr->protocol;
+ xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
+ if (ip_is_fragment(nhptr))
+ return -1;
+#endif
+
+ h1 = hash_dst(dst, protocol, tunnelid);
+ h2 = hash_src(src);
+
+ for (s = rcu_dereference_bh(head->ht[h1]); s;
+ s = rcu_dereference_bh(s->next)) {
+ if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
+ protocol == s->protocol &&
+ !(s->dpi.mask &
+ (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
+#if RSVP_DST_LEN == 4
+ dst[0] == s->dst[0] &&
+ dst[1] == s->dst[1] &&
+ dst[2] == s->dst[2] &&
+#endif
+ tunnelid == s->tunnelid) {
+
+ for (f = rcu_dereference_bh(s->ht[h2]); f;
+ f = rcu_dereference_bh(f->next)) {
+ if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
+ !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
+#if RSVP_DST_LEN == 4
+ &&
+ src[0] == f->src[0] &&
+ src[1] == f->src[1] &&
+ src[2] == f->src[2]
+#endif
+ ) {
+ *res = f->res;
+ RSVP_APPLY_RESULT();
+
+matched:
+ if (f->tunnelhdr == 0)
+ return 0;
+
+ tunnelid = f->res.classid;
+ nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
+ goto restart;
+ }
+ }
+
+ /* And wildcard bucket... */
+ for (f = rcu_dereference_bh(s->ht[16]); f;
+ f = rcu_dereference_bh(f->next)) {
+ *res = f->res;
+ RSVP_APPLY_RESULT();
+ goto matched;
+ }
+ return -1;
+ }
+ }
+ return -1;
+}
+
+static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ struct rsvp_session *s;
+ struct rsvp_filter __rcu **ins;
+ struct rsvp_filter *pins;
+ unsigned int h1 = h & 0xFF;
+ unsigned int h2 = (h >> 8) & 0xFF;
+
+ for (s = rtnl_dereference(head->ht[h1]); s;
+ s = rtnl_dereference(s->next)) {
+ for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ;
+ ins = &pins->next, pins = rtnl_dereference(*ins)) {
+ if (pins->handle == h) {
+ RCU_INIT_POINTER(n->next, pins->next);
+ rcu_assign_pointer(*ins, n);
+ return;
+ }
+ }
+ }
+
+ /* Something went wrong if we are trying to replace a non-existant
+ * node. Mind as well halt instead of silently failing.
+ */
+ BUG_ON(1);
+}
+
+static void *rsvp_get(struct tcf_proto *tp, u32 handle)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ struct rsvp_session *s;
+ struct rsvp_filter *f;
+ unsigned int h1 = handle & 0xFF;
+ unsigned int h2 = (handle >> 8) & 0xFF;
+
+ if (h2 > 16)
+ return NULL;
+
+ for (s = rtnl_dereference(head->ht[h1]); s;
+ s = rtnl_dereference(s->next)) {
+ for (f = rtnl_dereference(s->ht[h2]); f;
+ f = rtnl_dereference(f->next)) {
+ if (f->handle == handle)
+ return f;
+ }
+ }
+ return NULL;
+}
+
+static int rsvp_init(struct tcf_proto *tp)
+{
+ struct rsvp_head *data;
+
+ data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
+ if (data) {
+ rcu_assign_pointer(tp->root, data);
+ return 0;
+ }
+ return -ENOBUFS;
+}
+
+static void __rsvp_delete_filter(struct rsvp_filter *f)
+{
+ tcf_exts_destroy(&f->exts);
+ tcf_exts_put_net(&f->exts);
+ kfree(f);
+}
+
+static void rsvp_delete_filter_work(struct work_struct *work)
+{
+ struct rsvp_filter *f = container_of(to_rcu_work(work),
+ struct rsvp_filter,
+ rwork);
+ rtnl_lock();
+ __rsvp_delete_filter(f);
+ rtnl_unlock();
+}
+
+static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
+{
+ tcf_unbind_filter(tp, &f->res);
+ /* all classifiers are required to call tcf_exts_destroy() after rcu
+ * grace period, since converted-to-rcu actions are relying on that
+ * in cleanup() callback
+ */
+ if (tcf_exts_get_net(&f->exts))
+ tcf_queue_work(&f->rwork, rsvp_delete_filter_work);
+ else
+ __rsvp_delete_filter(f);
+}
+
+static void rsvp_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct rsvp_head *data = rtnl_dereference(tp->root);
+ int h1, h2;
+
+ if (data == NULL)
+ return;
+
+ for (h1 = 0; h1 < 256; h1++) {
+ struct rsvp_session *s;
+
+ while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
+ RCU_INIT_POINTER(data->ht[h1], s->next);
+
+ for (h2 = 0; h2 <= 16; h2++) {
+ struct rsvp_filter *f;
+
+ while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
+ rcu_assign_pointer(s->ht[h2], f->next);
+ rsvp_delete_filter(tp, f);
+ }
+ }
+ kfree_rcu(s, rcu);
+ }
+ }
+ kfree_rcu(data, rcu);
+}
+
+static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ struct rsvp_filter *nfp, *f = arg;
+ struct rsvp_filter __rcu **fp;
+ unsigned int h = f->handle;
+ struct rsvp_session __rcu **sp;
+ struct rsvp_session *nsp, *s = f->sess;
+ int i, h1;
+
+ fp = &s->ht[(h >> 8) & 0xFF];
+ for (nfp = rtnl_dereference(*fp); nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
+ if (nfp == f) {
+ RCU_INIT_POINTER(*fp, f->next);
+ rsvp_delete_filter(tp, f);
+
+ /* Strip tree */
+
+ for (i = 0; i <= 16; i++)
+ if (s->ht[i])
+ goto out;
+
+ /* OK, session has no flows */
+ sp = &head->ht[h & 0xFF];
+ for (nsp = rtnl_dereference(*sp); nsp;
+ sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
+ if (nsp == s) {
+ RCU_INIT_POINTER(*sp, s->next);
+ kfree_rcu(s, rcu);
+ goto out;
+ }
+ }
+
+ break;
+ }
+ }
+
+out:
+ *last = true;
+ for (h1 = 0; h1 < 256; h1++) {
+ if (rcu_access_pointer(head->ht[h1])) {
+ *last = false;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
+{
+ struct rsvp_head *data = rtnl_dereference(tp->root);
+ int i = 0xFFFF;
+
+ while (i-- > 0) {
+ u32 h;
+
+ if ((data->hgenerator += 0x10000) == 0)
+ data->hgenerator = 0x10000;
+ h = data->hgenerator|salt;
+ if (!rsvp_get(tp, h))
+ return h;
+ }
+ return 0;
+}
+
+static int tunnel_bts(struct rsvp_head *data)
+{
+ int n = data->tgenerator >> 5;
+ u32 b = 1 << (data->tgenerator & 0x1F);
+
+ if (data->tmap[n] & b)
+ return 0;
+ data->tmap[n] |= b;
+ return 1;
+}
+
+static void tunnel_recycle(struct rsvp_head *data)
+{
+ struct rsvp_session __rcu **sht = data->ht;
+ u32 tmap[256/32];
+ int h1, h2;
+
+ memset(tmap, 0, sizeof(tmap));
+
+ for (h1 = 0; h1 < 256; h1++) {
+ struct rsvp_session *s;
+ for (s = rtnl_dereference(sht[h1]); s;
+ s = rtnl_dereference(s->next)) {
+ for (h2 = 0; h2 <= 16; h2++) {
+ struct rsvp_filter *f;
+
+ for (f = rtnl_dereference(s->ht[h2]); f;
+ f = rtnl_dereference(f->next)) {
+ if (f->tunnelhdr == 0)
+ continue;
+ data->tgenerator = f->res.classid;
+ tunnel_bts(data);
+ }
+ }
+ }
+ }
+
+ memcpy(data->tmap, tmap, sizeof(tmap));
+}
+
+static u32 gen_tunnel(struct rsvp_head *data)
+{
+ int i, k;
+
+ for (k = 0; k < 2; k++) {
+ for (i = 255; i > 0; i--) {
+ if (++data->tgenerator == 0)
+ data->tgenerator = 1;
+ if (tunnel_bts(data))
+ return data->tgenerator;
+ }
+ tunnel_recycle(data);
+ }
+ return 0;
+}
+
+static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
+ [TCA_RSVP_CLASSID] = { .type = NLA_U32 },
+ [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
+};
+
+static int rsvp_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle,
+ struct nlattr **tca,
+ void **arg, bool ovr, struct netlink_ext_ack *extack)
+{
+ struct rsvp_head *data = rtnl_dereference(tp->root);
+ struct rsvp_filter *f, *nfp;
+ struct rsvp_filter __rcu **fp;
+ struct rsvp_session *nsp, *s;
+ struct rsvp_session __rcu **sp;
+ struct tc_rsvp_pinfo *pinfo = NULL;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_RSVP_MAX + 1];
+ struct tcf_exts e;
+ unsigned int h1, h2;
+ __be32 *dst;
+ int err;
+
+ if (opt == NULL)
+ return handle ? -EINVAL : 0;
+
+ err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy, NULL);
+ if (err < 0)
+ return err;
+
+ err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ if (err < 0)
+ return err;
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, extack);
+ if (err < 0)
+ goto errout2;
+
+ f = *arg;
+ if (f) {
+ /* Node exists: adjust only classid */
+ struct rsvp_filter *n;
+
+ if (f->handle != handle && handle)
+ goto errout2;
+
+ n = kmemdup(f, sizeof(*f), GFP_KERNEL);
+ if (!n) {
+ err = -ENOMEM;
+ goto errout2;
+ }
+
+ err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ if (err < 0) {
+ kfree(n);
+ goto errout2;
+ }
+
+ if (tb[TCA_RSVP_CLASSID]) {
+ n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
+ tcf_bind_filter(tp, &n->res, base);
+ }
+
+ tcf_exts_change(&n->exts, &e);
+ rsvp_replace(tp, n, handle);
+ return 0;
+ }
+
+ /* Now more serious part... */
+ err = -EINVAL;
+ if (handle)
+ goto errout2;
+ if (tb[TCA_RSVP_DST] == NULL)
+ goto errout2;
+
+ err = -ENOBUFS;
+ f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
+ if (f == NULL)
+ goto errout2;
+
+ err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ if (err < 0)
+ goto errout;
+ h2 = 16;
+ if (tb[TCA_RSVP_SRC]) {
+ memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
+ h2 = hash_src(f->src);
+ }
+ if (tb[TCA_RSVP_PINFO]) {
+ pinfo = nla_data(tb[TCA_RSVP_PINFO]);
+ f->spi = pinfo->spi;
+ f->tunnelhdr = pinfo->tunnelhdr;
+ }
+ if (tb[TCA_RSVP_CLASSID])
+ f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
+
+ dst = nla_data(tb[TCA_RSVP_DST]);
+ h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
+
+ err = -ENOMEM;
+ if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
+ goto errout;
+
+ if (f->tunnelhdr) {
+ err = -EINVAL;
+ if (f->res.classid > 255)
+ goto errout;
+
+ err = -ENOMEM;
+ if (f->res.classid == 0 &&
+ (f->res.classid = gen_tunnel(data)) == 0)
+ goto errout;
+ }
+
+ for (sp = &data->ht[h1];
+ (s = rtnl_dereference(*sp)) != NULL;
+ sp = &s->next) {
+ if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
+ pinfo && pinfo->protocol == s->protocol &&
+ memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
+#if RSVP_DST_LEN == 4
+ dst[0] == s->dst[0] &&
+ dst[1] == s->dst[1] &&
+ dst[2] == s->dst[2] &&
+#endif
+ pinfo->tunnelid == s->tunnelid) {
+
+insert:
+ /* OK, we found appropriate session */
+
+ fp = &s->ht[h2];
+
+ f->sess = s;
+ if (f->tunnelhdr == 0)
+ tcf_bind_filter(tp, &f->res, base);
+
+ tcf_exts_change(&f->exts, &e);
+
+ fp = &s->ht[h2];
+ for (nfp = rtnl_dereference(*fp); nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
+ __u32 mask = nfp->spi.mask & f->spi.mask;
+
+ if (mask != f->spi.mask)
+ break;
+ }
+ RCU_INIT_POINTER(f->next, nfp);
+ rcu_assign_pointer(*fp, f);
+
+ *arg = f;
+ return 0;
+ }
+ }
+
+ /* No session found. Create new one. */
+
+ err = -ENOBUFS;
+ s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
+ if (s == NULL)
+ goto errout;
+ memcpy(s->dst, dst, sizeof(s->dst));
+
+ if (pinfo) {
+ s->dpi = pinfo->dpi;
+ s->protocol = pinfo->protocol;
+ s->tunnelid = pinfo->tunnelid;
+ }
+ sp = &data->ht[h1];
+ for (nsp = rtnl_dereference(*sp); nsp;
+ sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
+ if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
+ break;
+ }
+ RCU_INIT_POINTER(s->next, nsp);
+ rcu_assign_pointer(*sp, s);
+
+ goto insert;
+
+errout:
+ tcf_exts_destroy(&f->exts);
+ kfree(f);
+errout2:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ unsigned int h, h1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h < 256; h++) {
+ struct rsvp_session *s;
+
+ for (s = rtnl_dereference(head->ht[h]); s;
+ s = rtnl_dereference(s->next)) {
+ for (h1 = 0; h1 <= 16; h1++) {
+ struct rsvp_filter *f;
+
+ for (f = rtnl_dereference(s->ht[h1]); f;
+ f = rtnl_dereference(f->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+ }
+ }
+}
+
+static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct rsvp_filter *f = fh;
+ struct rsvp_session *s;
+ struct nlattr *nest;
+ struct tc_rsvp_pinfo pinfo;
+
+ if (f == NULL)
+ return skb->len;
+ s = f->sess;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
+ goto nla_put_failure;
+ pinfo.dpi = s->dpi;
+ pinfo.spi = f->spi;
+ pinfo.protocol = s->protocol;
+ pinfo.tunnelid = s->tunnelid;
+ pinfo.tunnelhdr = f->tunnelhdr;
+ pinfo.pad = 0;
+ if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
+ goto nla_put_failure;
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
+ goto nla_put_failure;
+ if (((f->handle >> 8) & 0xFF) != 16 &&
+ nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
+{
+ struct rsvp_filter *f = fh;
+
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
+}
+
+static struct tcf_proto_ops RSVP_OPS __read_mostly = {
+ .kind = RSVP_ID,
+ .classify = rsvp_classify,
+ .init = rsvp_init,
+ .destroy = rsvp_destroy,
+ .get = rsvp_get,
+ .change = rsvp_change,
+ .delete = rsvp_delete,
+ .walk = rsvp_walk,
+ .dump = rsvp_dump,
+ .bind_class = rsvp_bind_class,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_rsvp(void)
+{
+ return register_tcf_proto_ops(&RSVP_OPS);
+}
+
+static void __exit exit_rsvp(void)
+{
+ unregister_tcf_proto_ops(&RSVP_OPS);
+}
+
+module_init(init_rsvp)
+module_exit(exit_rsvp)
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
new file mode 100644
index 000000000..dd08aea2a
--- /dev/null
+++ b/net/sched/cls_rsvp6.c
@@ -0,0 +1,28 @@
+/*
+ * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/ipv6.h>
+#include <linux/skbuff.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <net/netlink.h>
+
+#define RSVP_DST_LEN 4
+#define RSVP_ID "rsvp6"
+#define RSVP_OPS cls_rsvp6_ops
+
+#include "cls_rsvp.h"
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
new file mode 100644
index 000000000..4070197f9
--- /dev/null
+++ b/net/sched/cls_tcindex.c
@@ -0,0 +1,698 @@
+/*
+ * net/sched/cls_tcindex.c Packet classifier for skb->tc_index
+ *
+ * Written 1998,1999 by Werner Almesberger, EPFL ICA
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/sch_generic.h>
+
+/*
+ * Passing parameters to the root seems to be done more awkwardly than really
+ * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
+ * verified. FIXME.
+ */
+
+#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
+#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
+
+
+struct tcindex_filter_result {
+ struct tcf_exts exts;
+ struct tcf_result res;
+ struct rcu_work rwork;
+};
+
+struct tcindex_filter {
+ u16 key;
+ struct tcindex_filter_result result;
+ struct tcindex_filter __rcu *next;
+ struct rcu_work rwork;
+};
+
+
+struct tcindex_data {
+ struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
+ struct tcindex_filter __rcu **h; /* imperfect hash; */
+ struct tcf_proto *tp;
+ u16 mask; /* AND key with mask */
+ u32 shift; /* shift ANDed key to the right */
+ u32 hash; /* hash table size; 0 if undefined */
+ u32 alloc_hash; /* allocated size */
+ u32 fall_through; /* 0: only classify if explicit match */
+ struct rcu_work rwork;
+};
+
+static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
+{
+ return tcf_exts_has_actions(&r->exts) || r->res.classid;
+}
+
+static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
+ u16 key)
+{
+ if (p->perfect) {
+ struct tcindex_filter_result *f = p->perfect + key;
+
+ return tcindex_filter_is_set(f) ? f : NULL;
+ } else if (p->h) {
+ struct tcindex_filter __rcu **fp;
+ struct tcindex_filter *f;
+
+ fp = &p->h[key % p->hash];
+ for (f = rcu_dereference_bh_rtnl(*fp);
+ f;
+ fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
+ if (f->key == key)
+ return &f->result;
+ }
+
+ return NULL;
+}
+
+
+static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct tcindex_data *p = rcu_dereference_bh(tp->root);
+ struct tcindex_filter_result *f;
+ int key = (skb->tc_index & p->mask) >> p->shift;
+
+ pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
+ skb, tp, res, p);
+
+ f = tcindex_lookup(p, key);
+ if (!f) {
+ struct Qdisc *q = tcf_block_q(tp->chain->block);
+
+ if (!p->fall_through)
+ return -1;
+ res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key);
+ res->class = 0;
+ pr_debug("alg 0x%x\n", res->classid);
+ return 0;
+ }
+ *res = f->res;
+ pr_debug("map 0x%x\n", res->classid);
+
+ return tcf_exts_exec(skb, &f->exts, res);
+}
+
+
+static void *tcindex_get(struct tcf_proto *tp, u32 handle)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r;
+
+ pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
+ if (p->perfect && handle >= p->alloc_hash)
+ return NULL;
+ r = tcindex_lookup(p, handle);
+ return r && tcindex_filter_is_set(r) ? r : NULL;
+}
+
+static int tcindex_init(struct tcf_proto *tp)
+{
+ struct tcindex_data *p;
+
+ pr_debug("tcindex_init(tp %p)\n", tp);
+ p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->mask = 0xffff;
+ p->hash = DEFAULT_HASH_SIZE;
+ p->fall_through = 1;
+
+ rcu_assign_pointer(tp->root, p);
+ return 0;
+}
+
+static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
+{
+ tcf_exts_destroy(&r->exts);
+ tcf_exts_put_net(&r->exts);
+}
+
+static void tcindex_destroy_rexts_work(struct work_struct *work)
+{
+ struct tcindex_filter_result *r;
+
+ r = container_of(to_rcu_work(work),
+ struct tcindex_filter_result,
+ rwork);
+ rtnl_lock();
+ __tcindex_destroy_rexts(r);
+ rtnl_unlock();
+}
+
+static void __tcindex_destroy_fexts(struct tcindex_filter *f)
+{
+ tcf_exts_destroy(&f->result.exts);
+ tcf_exts_put_net(&f->result.exts);
+ kfree(f);
+}
+
+static void tcindex_destroy_fexts_work(struct work_struct *work)
+{
+ struct tcindex_filter *f = container_of(to_rcu_work(work),
+ struct tcindex_filter,
+ rwork);
+
+ rtnl_lock();
+ __tcindex_destroy_fexts(f);
+ rtnl_unlock();
+}
+
+static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r = arg;
+ struct tcindex_filter __rcu **walk;
+ struct tcindex_filter *f = NULL;
+
+ pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p);
+ if (p->perfect) {
+ if (!r->res.class)
+ return -ENOENT;
+ } else {
+ int i;
+
+ for (i = 0; i < p->hash; i++) {
+ walk = p->h + i;
+ for (f = rtnl_dereference(*walk); f;
+ walk = &f->next, f = rtnl_dereference(*walk)) {
+ if (&f->result == r)
+ goto found;
+ }
+ }
+ return -ENOENT;
+
+found:
+ rcu_assign_pointer(*walk, rtnl_dereference(f->next));
+ }
+ tcf_unbind_filter(tp, &r->res);
+ /* all classifiers are required to call tcf_exts_destroy() after rcu
+ * grace period, since converted-to-rcu actions are relying on that
+ * in cleanup() callback
+ */
+ if (f) {
+ if (tcf_exts_get_net(&f->result.exts))
+ tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
+ else
+ __tcindex_destroy_fexts(f);
+ } else {
+ if (tcf_exts_get_net(&r->exts))
+ tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
+ else
+ __tcindex_destroy_rexts(r);
+ }
+
+ *last = false;
+ return 0;
+}
+
+static void tcindex_destroy_work(struct work_struct *work)
+{
+ struct tcindex_data *p = container_of(to_rcu_work(work),
+ struct tcindex_data,
+ rwork);
+
+ kfree(p->perfect);
+ kfree(p->h);
+ kfree(p);
+}
+
+static inline int
+valid_perfect_hash(struct tcindex_data *p)
+{
+ return p->hash > (p->mask >> p->shift);
+}
+
+static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
+ [TCA_TCINDEX_HASH] = { .type = NLA_U32 },
+ [TCA_TCINDEX_MASK] = { .type = NLA_U16 },
+ [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 },
+ [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 },
+ [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
+};
+
+static int tcindex_filter_result_init(struct tcindex_filter_result *r)
+{
+ memset(r, 0, sizeof(*r));
+ return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+}
+
+static void tcindex_partial_destroy_work(struct work_struct *work)
+{
+ struct tcindex_data *p = container_of(to_rcu_work(work),
+ struct tcindex_data,
+ rwork);
+
+ kfree(p->perfect);
+ kfree(p);
+}
+
+static void tcindex_free_perfect_hash(struct tcindex_data *cp)
+{
+ int i;
+
+ for (i = 0; i < cp->hash; i++)
+ tcf_exts_destroy(&cp->perfect[i].exts);
+ kfree(cp->perfect);
+}
+
+static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
+{
+ int i, err = 0;
+
+ cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!cp->perfect)
+ return -ENOMEM;
+
+ for (i = 0; i < cp->hash; i++) {
+ err = tcf_exts_init(&cp->perfect[i].exts,
+ TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ if (err < 0)
+ goto errout;
+#ifdef CONFIG_NET_CLS_ACT
+ cp->perfect[i].exts.net = net;
+#endif
+ }
+
+ return 0;
+
+errout:
+ tcindex_free_perfect_hash(cp);
+ return err;
+}
+
+static int
+tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct tcindex_data *p,
+ struct tcindex_filter_result *r, struct nlattr **tb,
+ struct nlattr *est, bool ovr, struct netlink_ext_ack *extack)
+{
+ struct tcindex_filter_result new_filter_result, *old_r = r;
+ struct tcindex_data *cp = NULL, *oldp;
+ struct tcindex_filter *f = NULL; /* make gcc behave */
+ struct tcf_result cr = {};
+ int err, balloc = 0;
+ struct tcf_exts e;
+
+ err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ if (err < 0)
+ return err;
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack);
+ if (err < 0)
+ goto errout;
+
+ err = -ENOMEM;
+ /* tcindex_data attributes must look atomic to classifier/lookup so
+ * allocate new tcindex data and RCU assign it onto root. Keeping
+ * perfect hash and hash pointers from old data.
+ */
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp)
+ goto errout;
+
+ cp->mask = p->mask;
+ cp->shift = p->shift;
+ cp->hash = p->hash;
+ cp->alloc_hash = p->alloc_hash;
+ cp->fall_through = p->fall_through;
+ cp->tp = tp;
+
+ if (tb[TCA_TCINDEX_HASH])
+ cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
+
+ if (tb[TCA_TCINDEX_MASK])
+ cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+
+ if (tb[TCA_TCINDEX_SHIFT]) {
+ cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+ if (cp->shift > 16) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+ if (!cp->hash) {
+ /* Hash not specified, use perfect hash if the upper limit
+ * of the hashing index is below the threshold.
+ */
+ if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
+ cp->hash = (cp->mask >> cp->shift) + 1;
+ else
+ cp->hash = DEFAULT_HASH_SIZE;
+ }
+
+ if (p->perfect) {
+ int i;
+
+ if (tcindex_alloc_perfect_hash(net, cp) < 0)
+ goto errout;
+ cp->alloc_hash = cp->hash;
+ for (i = 0; i < min(cp->hash, p->hash); i++)
+ cp->perfect[i].res = p->perfect[i].res;
+ balloc = 1;
+ }
+ cp->h = p->h;
+
+ err = tcindex_filter_result_init(&new_filter_result);
+ if (err < 0)
+ goto errout_alloc;
+ if (old_r)
+ cr = r->res;
+
+ err = -EBUSY;
+
+ /* Hash already allocated, make sure that we still meet the
+ * requirements for the allocated hash.
+ */
+ if (cp->perfect) {
+ if (!valid_perfect_hash(cp) ||
+ cp->hash > cp->alloc_hash)
+ goto errout_alloc;
+ } else if (cp->h && cp->hash != cp->alloc_hash) {
+ goto errout_alloc;
+ }
+
+ err = -EINVAL;
+ if (tb[TCA_TCINDEX_FALL_THROUGH])
+ cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
+
+ if (!cp->perfect && !cp->h)
+ cp->alloc_hash = cp->hash;
+
+ /* Note: this could be as restrictive as if (handle & ~(mask >> shift))
+ * but then, we'd fail handles that may become valid after some future
+ * mask change. While this is extremely unlikely to ever matter,
+ * the check below is safer (and also more backwards-compatible).
+ */
+ if (cp->perfect || valid_perfect_hash(cp))
+ if (handle >= cp->alloc_hash)
+ goto errout_alloc;
+
+
+ err = -ENOMEM;
+ if (!cp->perfect && !cp->h) {
+ if (valid_perfect_hash(cp)) {
+ if (tcindex_alloc_perfect_hash(net, cp) < 0)
+ goto errout_alloc;
+ balloc = 1;
+ } else {
+ struct tcindex_filter __rcu **hash;
+
+ hash = kcalloc(cp->hash,
+ sizeof(struct tcindex_filter *),
+ GFP_KERNEL);
+
+ if (!hash)
+ goto errout_alloc;
+
+ cp->h = hash;
+ balloc = 2;
+ }
+ }
+
+ if (cp->perfect)
+ r = cp->perfect + handle;
+ else
+ r = tcindex_lookup(cp, handle) ? : &new_filter_result;
+
+ if (r == &new_filter_result) {
+ f = kzalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ goto errout_alloc;
+ f->key = handle;
+ f->next = NULL;
+ err = tcindex_filter_result_init(&f->result);
+ if (err < 0) {
+ kfree(f);
+ goto errout_alloc;
+ }
+ }
+
+ if (tb[TCA_TCINDEX_CLASSID]) {
+ cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
+ tcf_bind_filter(tp, &cr, base);
+ }
+
+ if (old_r && old_r != r) {
+ err = tcindex_filter_result_init(old_r);
+ if (err < 0) {
+ kfree(f);
+ goto errout_alloc;
+ }
+ }
+
+ oldp = p;
+ r->res = cr;
+ tcf_exts_change(&r->exts, &e);
+
+ rcu_assign_pointer(tp->root, cp);
+
+ if (r == &new_filter_result) {
+ struct tcindex_filter *nfp;
+ struct tcindex_filter __rcu **fp;
+
+ f->result.res = r->res;
+ tcf_exts_change(&f->result.exts, &r->exts);
+
+ fp = cp->h + (handle % cp->hash);
+ for (nfp = rtnl_dereference(*fp);
+ nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp))
+ ; /* nothing */
+
+ rcu_assign_pointer(*fp, f);
+ } else {
+ tcf_exts_destroy(&new_filter_result.exts);
+ }
+
+ if (oldp)
+ tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work);
+ return 0;
+
+errout_alloc:
+ if (balloc == 1)
+ tcindex_free_perfect_hash(cp);
+ else if (balloc == 2)
+ kfree(cp->h);
+ tcf_exts_destroy(&new_filter_result.exts);
+errout:
+ kfree(cp);
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static int
+tcindex_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, void **arg, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_TCINDEX_MAX + 1];
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r = *arg;
+ int err;
+
+ pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
+ "p %p,r %p,*arg %p\n",
+ tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL);
+
+ if (!opt)
+ return 0;
+
+ err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy, NULL);
+ if (err < 0)
+ return err;
+
+ return tcindex_set_parms(net, tp, base, handle, p, r, tb,
+ tca[TCA_RATE], ovr, extack);
+}
+
+static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter *f, *next;
+ int i;
+
+ pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
+ if (p->perfect) {
+ for (i = 0; i < p->hash; i++) {
+ if (!p->perfect[i].res.class)
+ continue;
+ if (walker->count >= walker->skip) {
+ if (walker->fn(tp, p->perfect + i, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+ }
+ if (!p->h)
+ return;
+ for (i = 0; i < p->hash; i++) {
+ for (f = rtnl_dereference(p->h[i]); f; f = next) {
+ next = rtnl_dereference(f->next);
+ if (walker->count >= walker->skip) {
+ if (walker->fn(tp, &f->result, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+ }
+}
+
+static void tcindex_destroy(struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ int i;
+
+ pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
+
+ if (p->perfect) {
+ for (i = 0; i < p->hash; i++) {
+ struct tcindex_filter_result *r = p->perfect + i;
+
+ tcf_unbind_filter(tp, &r->res);
+ if (tcf_exts_get_net(&r->exts))
+ tcf_queue_work(&r->rwork,
+ tcindex_destroy_rexts_work);
+ else
+ __tcindex_destroy_rexts(r);
+ }
+ }
+
+ for (i = 0; p->h && i < p->hash; i++) {
+ struct tcindex_filter *f, *next;
+ bool last;
+
+ for (f = rtnl_dereference(p->h[i]); f; f = next) {
+ next = rtnl_dereference(f->next);
+ tcindex_delete(tp, &f->result, &last, NULL);
+ }
+ }
+
+ tcf_queue_work(&p->rwork, tcindex_destroy_work);
+}
+
+
+static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r = fh;
+ struct nlattr *nest;
+
+ pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n",
+ tp, fh, skb, t, p, r);
+ pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (!fh) {
+ t->tcm_handle = ~0; /* whatever ... */
+ if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
+ nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
+ nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
+ nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ } else {
+ if (p->perfect) {
+ t->tcm_handle = r - p->perfect;
+ } else {
+ struct tcindex_filter *f;
+ struct tcindex_filter __rcu **fp;
+ int i;
+
+ t->tcm_handle = 0;
+ for (i = 0; !t->tcm_handle && i < p->hash; i++) {
+ fp = &p->h[i];
+ for (f = rtnl_dereference(*fp);
+ !t->tcm_handle && f;
+ fp = &f->next, f = rtnl_dereference(*fp)) {
+ if (&f->result == r)
+ t->tcm_handle = f->key;
+ }
+ }
+ }
+ pr_debug("handle = %d\n", t->tcm_handle);
+ if (r->res.class &&
+ nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &r->exts) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &r->exts) < 0)
+ goto nla_put_failure;
+ }
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl,
+ void *q, unsigned long base)
+{
+ struct tcindex_filter_result *r = fh;
+
+ if (r && r->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &r->res, base);
+ else
+ __tcf_unbind_filter(q, &r->res);
+ }
+}
+
+static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
+ .kind = "tcindex",
+ .classify = tcindex_classify,
+ .init = tcindex_init,
+ .destroy = tcindex_destroy,
+ .get = tcindex_get,
+ .change = tcindex_change,
+ .delete = tcindex_delete,
+ .walk = tcindex_walk,
+ .dump = tcindex_dump,
+ .bind_class = tcindex_bind_class,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_tcindex(void)
+{
+ return register_tcf_proto_ops(&cls_tcindex_ops);
+}
+
+static void __exit exit_tcindex(void)
+{
+ unregister_tcf_proto_ops(&cls_tcindex_ops);
+}
+
+module_init(init_tcindex)
+module_exit(exit_tcindex)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
new file mode 100644
index 000000000..d30256ac3
--- /dev/null
+++ b/net/sched/cls_u32.c
@@ -0,0 +1,1507 @@
+/*
+ * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * The filters are packed to hash tables of key nodes
+ * with a set of 32bit key/mask pairs at every node.
+ * Nodes reference next level hash tables etc.
+ *
+ * This scheme is the best universal classifier I managed to
+ * invent; it is not super-fast, but it is not slow (provided you
+ * program it correctly), and general enough. And its relative
+ * speed grows as the number of rules becomes larger.
+ *
+ * It seems that it represents the best middle point between
+ * speed and manageability both by human and by machine.
+ *
+ * It is especially useful for link sharing combined with QoS;
+ * pure RSVP doesn't need such a general approach and can use
+ * much simpler (and faster) schemes, sort of cls_rsvp.c.
+ *
+ * JHS: We should remove the CONFIG_NET_CLS_IND from here
+ * eventually when the meta match extension is made available
+ *
+ * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/bitmap.h>
+#include <linux/netdevice.h>
+#include <linux/hash.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <linux/idr.h>
+
+struct tc_u_knode {
+ struct tc_u_knode __rcu *next;
+ u32 handle;
+ struct tc_u_hnode __rcu *ht_up;
+ struct tcf_exts exts;
+#ifdef CONFIG_NET_CLS_IND
+ int ifindex;
+#endif
+ u8 fshift;
+ struct tcf_result res;
+ struct tc_u_hnode __rcu *ht_down;
+#ifdef CONFIG_CLS_U32_PERF
+ struct tc_u32_pcnt __percpu *pf;
+#endif
+ u32 flags;
+ unsigned int in_hw_count;
+#ifdef CONFIG_CLS_U32_MARK
+ u32 val;
+ u32 mask;
+ u32 __percpu *pcpu_success;
+#endif
+ struct tcf_proto *tp;
+ struct rcu_work rwork;
+ /* The 'sel' field MUST be the last field in structure to allow for
+ * tc_u32_keys allocated at end of structure.
+ */
+ struct tc_u32_sel sel;
+};
+
+struct tc_u_hnode {
+ struct tc_u_hnode __rcu *next;
+ u32 handle;
+ u32 prio;
+ struct tc_u_common *tp_c;
+ int refcnt;
+ unsigned int divisor;
+ struct idr handle_idr;
+ struct rcu_head rcu;
+ u32 flags;
+ /* The 'ht' field MUST be the last field in structure to allow for
+ * more entries allocated at end of structure.
+ */
+ struct tc_u_knode __rcu *ht[1];
+};
+
+struct tc_u_common {
+ struct tc_u_hnode __rcu *hlist;
+ void *ptr;
+ int refcnt;
+ struct idr handle_idr;
+ struct hlist_node hnode;
+ struct rcu_head rcu;
+};
+
+static inline unsigned int u32_hash_fold(__be32 key,
+ const struct tc_u32_sel *sel,
+ u8 fshift)
+{
+ unsigned int h = ntohl(key & sel->hmask) >> fshift;
+
+ return h;
+}
+
+static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct {
+ struct tc_u_knode *knode;
+ unsigned int off;
+ } stack[TC_U32_MAXDEPTH];
+
+ struct tc_u_hnode *ht = rcu_dereference_bh(tp->root);
+ unsigned int off = skb_network_offset(skb);
+ struct tc_u_knode *n;
+ int sdepth = 0;
+ int off2 = 0;
+ int sel = 0;
+#ifdef CONFIG_CLS_U32_PERF
+ int j;
+#endif
+ int i, r;
+
+next_ht:
+ n = rcu_dereference_bh(ht->ht[sel]);
+
+next_knode:
+ if (n) {
+ struct tc_u32_key *key = n->sel.keys;
+
+#ifdef CONFIG_CLS_U32_PERF
+ __this_cpu_inc(n->pf->rcnt);
+ j = 0;
+#endif
+
+ if (tc_skip_sw(n->flags)) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+
+#ifdef CONFIG_CLS_U32_MARK
+ if ((skb->mark & n->mask) != n->val) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ } else {
+ __this_cpu_inc(*n->pcpu_success);
+ }
+#endif
+
+ for (i = n->sel.nkeys; i > 0; i--, key++) {
+ int toff = off + key->off + (off2 & key->offmask);
+ __be32 *data, hdata;
+
+ if (skb_headroom(skb) + toff > INT_MAX)
+ goto out;
+
+ data = skb_header_pointer(skb, toff, 4, &hdata);
+ if (!data)
+ goto out;
+ if ((*data ^ key->val) & key->mask) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+#ifdef CONFIG_CLS_U32_PERF
+ __this_cpu_inc(n->pf->kcnts[j]);
+ j++;
+#endif
+ }
+
+ ht = rcu_dereference_bh(n->ht_down);
+ if (!ht) {
+check_terminal:
+ if (n->sel.flags & TC_U32_TERMINAL) {
+
+ *res = n->res;
+#ifdef CONFIG_NET_CLS_IND
+ if (!tcf_match_indev(skb, n->ifindex)) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+#endif
+#ifdef CONFIG_CLS_U32_PERF
+ __this_cpu_inc(n->pf->rhit);
+#endif
+ r = tcf_exts_exec(skb, &n->exts, res);
+ if (r < 0) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+
+ return r;
+ }
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+
+ /* PUSH */
+ if (sdepth >= TC_U32_MAXDEPTH)
+ goto deadloop;
+ stack[sdepth].knode = n;
+ stack[sdepth].off = off;
+ sdepth++;
+
+ ht = rcu_dereference_bh(n->ht_down);
+ sel = 0;
+ if (ht->divisor) {
+ __be32 *data, hdata;
+
+ data = skb_header_pointer(skb, off + n->sel.hoff, 4,
+ &hdata);
+ if (!data)
+ goto out;
+ sel = ht->divisor & u32_hash_fold(*data, &n->sel,
+ n->fshift);
+ }
+ if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
+ goto next_ht;
+
+ if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
+ off2 = n->sel.off + 3;
+ if (n->sel.flags & TC_U32_VAROFFSET) {
+ __be16 *data, hdata;
+
+ data = skb_header_pointer(skb,
+ off + n->sel.offoff,
+ 2, &hdata);
+ if (!data)
+ goto out;
+ off2 += ntohs(n->sel.offmask & *data) >>
+ n->sel.offshift;
+ }
+ off2 &= ~3;
+ }
+ if (n->sel.flags & TC_U32_EAT) {
+ off += off2;
+ off2 = 0;
+ }
+
+ if (off < skb->len)
+ goto next_ht;
+ }
+
+ /* POP */
+ if (sdepth--) {
+ n = stack[sdepth].knode;
+ ht = rcu_dereference_bh(n->ht_up);
+ off = stack[sdepth].off;
+ goto check_terminal;
+ }
+out:
+ return -1;
+
+deadloop:
+ net_warn_ratelimited("cls_u32: dead loop\n");
+ return -1;
+}
+
+static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
+{
+ struct tc_u_hnode *ht;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next))
+ if (ht->handle == handle)
+ break;
+
+ return ht;
+}
+
+static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
+{
+ unsigned int sel;
+ struct tc_u_knode *n = NULL;
+
+ sel = TC_U32_HASH(handle);
+ if (sel > ht->divisor)
+ goto out;
+
+ for (n = rtnl_dereference(ht->ht[sel]);
+ n;
+ n = rtnl_dereference(n->next))
+ if (n->handle == handle)
+ break;
+out:
+ return n;
+}
+
+
+static void *u32_get(struct tcf_proto *tp, u32 handle)
+{
+ struct tc_u_hnode *ht;
+ struct tc_u_common *tp_c = tp->data;
+
+ if (TC_U32_HTID(handle) == TC_U32_ROOT)
+ ht = rtnl_dereference(tp->root);
+ else
+ ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
+
+ if (!ht)
+ return NULL;
+
+ if (TC_U32_KEY(handle) == 0)
+ return ht;
+
+ return u32_lookup_key(ht, handle);
+}
+
+/* Protected by rtnl lock */
+static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr)
+{
+ int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL);
+ if (id < 0)
+ return 0;
+ return (id | 0x800U) << 20;
+}
+
+static struct hlist_head *tc_u_common_hash;
+
+#define U32_HASH_SHIFT 10
+#define U32_HASH_SIZE (1 << U32_HASH_SHIFT)
+
+static void *tc_u_common_ptr(const struct tcf_proto *tp)
+{
+ struct tcf_block *block = tp->chain->block;
+
+ /* The block sharing is currently supported only
+ * for classless qdiscs. In that case we use block
+ * for tc_u_common identification. In case the
+ * block is not shared, block->q is a valid pointer
+ * and we can use that. That works for classful qdiscs.
+ */
+ if (tcf_block_shared(block))
+ return block;
+ else
+ return block->q;
+}
+
+static unsigned int tc_u_hash(const struct tcf_proto *tp)
+{
+ return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT);
+}
+
+static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
+{
+ struct tc_u_common *tc;
+ unsigned int h;
+
+ h = tc_u_hash(tp);
+ hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
+ if (tc->ptr == tc_u_common_ptr(tp))
+ return tc;
+ }
+ return NULL;
+}
+
+static int u32_init(struct tcf_proto *tp)
+{
+ struct tc_u_hnode *root_ht;
+ struct tc_u_common *tp_c;
+ unsigned int h;
+
+ tp_c = tc_u_common_find(tp);
+
+ root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
+ if (root_ht == NULL)
+ return -ENOBUFS;
+
+ root_ht->refcnt++;
+ root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
+ root_ht->prio = tp->prio;
+ idr_init(&root_ht->handle_idr);
+
+ if (tp_c == NULL) {
+ tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
+ if (tp_c == NULL) {
+ kfree(root_ht);
+ return -ENOBUFS;
+ }
+ tp_c->ptr = tc_u_common_ptr(tp);
+ INIT_HLIST_NODE(&tp_c->hnode);
+ idr_init(&tp_c->handle_idr);
+
+ h = tc_u_hash(tp);
+ hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
+ }
+
+ tp_c->refcnt++;
+ RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
+ rcu_assign_pointer(tp_c->hlist, root_ht);
+ root_ht->tp_c = tp_c;
+
+ root_ht->refcnt++;
+ rcu_assign_pointer(tp->root, root_ht);
+ tp->data = tp_c;
+ return 0;
+}
+
+static void __u32_destroy_key(struct tc_u_knode *n)
+{
+ struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
+
+ tcf_exts_destroy(&n->exts);
+ if (ht && --ht->refcnt == 0)
+ kfree(ht);
+ kfree(n);
+}
+
+static void u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
+ bool free_pf)
+{
+ tcf_exts_put_net(&n->exts);
+#ifdef CONFIG_CLS_U32_PERF
+ if (free_pf)
+ free_percpu(n->pf);
+#endif
+#ifdef CONFIG_CLS_U32_MARK
+ if (free_pf)
+ free_percpu(n->pcpu_success);
+#endif
+ __u32_destroy_key(n);
+}
+
+/* u32_delete_key_rcu should be called when free'ing a copied
+ * version of a tc_u_knode obtained from u32_init_knode(). When
+ * copies are obtained from u32_init_knode() the statistics are
+ * shared between the old and new copies to allow readers to
+ * continue to update the statistics during the copy. To support
+ * this the u32_delete_key_rcu variant does not free the percpu
+ * statistics.
+ */
+static void u32_delete_key_work(struct work_struct *work)
+{
+ struct tc_u_knode *key = container_of(to_rcu_work(work),
+ struct tc_u_knode,
+ rwork);
+ rtnl_lock();
+ u32_destroy_key(key->tp, key, false);
+ rtnl_unlock();
+}
+
+/* u32_delete_key_freepf_rcu is the rcu callback variant
+ * that free's the entire structure including the statistics
+ * percpu variables. Only use this if the key is not a copy
+ * returned by u32_init_knode(). See u32_delete_key_rcu()
+ * for the variant that should be used with keys return from
+ * u32_init_knode()
+ */
+static void u32_delete_key_freepf_work(struct work_struct *work)
+{
+ struct tc_u_knode *key = container_of(to_rcu_work(work),
+ struct tc_u_knode,
+ rwork);
+ rtnl_lock();
+ u32_destroy_key(key->tp, key, true);
+ rtnl_unlock();
+}
+
+static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
+{
+ struct tc_u_knode __rcu **kp;
+ struct tc_u_knode *pkp;
+ struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
+
+ if (ht) {
+ kp = &ht->ht[TC_U32_HASH(key->handle)];
+ for (pkp = rtnl_dereference(*kp); pkp;
+ kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
+ if (pkp == key) {
+ RCU_INIT_POINTER(*kp, key->next);
+
+ tcf_unbind_filter(tp, &key->res);
+ idr_remove(&ht->handle_idr, key->handle);
+ tcf_exts_get_net(&key->exts);
+ tcf_queue_work(&key->rwork, u32_delete_key_freepf_work);
+ return 0;
+ }
+ }
+ }
+ WARN_ON(1);
+ return 0;
+}
+
+static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_u32_offload cls_u32 = {};
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, extack);
+ cls_u32.command = TC_CLSU32_DELETE_HNODE;
+ cls_u32.hnode.divisor = h->divisor;
+ cls_u32.hnode.handle = h->handle;
+ cls_u32.hnode.prio = h->prio;
+
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+}
+
+static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
+ u32 flags, struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_u32_offload cls_u32 = {};
+ bool skip_sw = tc_skip_sw(flags);
+ bool offloaded = false;
+ int err;
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack);
+ cls_u32.command = TC_CLSU32_NEW_HNODE;
+ cls_u32.hnode.divisor = h->divisor;
+ cls_u32.hnode.handle = h->handle;
+ cls_u32.hnode.prio = h->prio;
+
+ err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ if (err < 0) {
+ u32_clear_hw_hnode(tp, h, NULL);
+ return err;
+ } else if (err > 0) {
+ offloaded = true;
+ }
+
+ if (skip_sw && !offloaded)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_u32_offload cls_u32 = {};
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
+ cls_u32.command = TC_CLSU32_DELETE_KNODE;
+ cls_u32.knode.handle = n->handle;
+
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+ tcf_block_offload_dec(block, &n->flags);
+}
+
+static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+ u32 flags, struct netlink_ext_ack *extack)
+{
+ struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_u32_offload cls_u32 = {};
+ bool skip_sw = tc_skip_sw(flags);
+ int err;
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack);
+ cls_u32.command = TC_CLSU32_REPLACE_KNODE;
+ cls_u32.knode.handle = n->handle;
+ cls_u32.knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+ cls_u32.knode.val = n->val;
+ cls_u32.knode.mask = n->mask;
+#else
+ cls_u32.knode.val = 0;
+ cls_u32.knode.mask = 0;
+#endif
+ cls_u32.knode.sel = &n->sel;
+ cls_u32.knode.exts = &n->exts;
+ if (n->ht_down)
+ cls_u32.knode.link_handle = ht->handle;
+
+ err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ if (err < 0) {
+ u32_remove_hw_knode(tp, n, NULL);
+ return err;
+ } else if (err > 0) {
+ n->in_hw_count = err;
+ tcf_block_offload_inc(block, &n->flags);
+ }
+
+ if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_u_knode *n;
+ unsigned int h;
+
+ for (h = 0; h <= ht->divisor; h++) {
+ while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
+ RCU_INIT_POINTER(ht->ht[h],
+ rtnl_dereference(n->next));
+ tcf_unbind_filter(tp, &n->res);
+ u32_remove_hw_knode(tp, n, extack);
+ idr_remove(&ht->handle_idr, n->handle);
+ if (tcf_exts_get_net(&n->exts))
+ tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
+ else
+ u32_destroy_key(n->tp, n, true);
+ }
+ }
+}
+
+static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode __rcu **hn;
+ struct tc_u_hnode *phn;
+
+ WARN_ON(--ht->refcnt);
+
+ u32_clear_hnode(tp, ht, extack);
+
+ hn = &tp_c->hlist;
+ for (phn = rtnl_dereference(*hn);
+ phn;
+ hn = &phn->next, phn = rtnl_dereference(*hn)) {
+ if (phn == ht) {
+ u32_clear_hw_hnode(tp, ht, extack);
+ idr_destroy(&ht->handle_idr);
+ idr_remove(&tp_c->handle_idr, ht->handle);
+ RCU_INIT_POINTER(*hn, ht->next);
+ kfree_rcu(ht, rcu);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static bool ht_empty(struct tc_u_hnode *ht)
+{
+ unsigned int h;
+
+ for (h = 0; h <= ht->divisor; h++)
+ if (rcu_access_pointer(ht->ht[h]))
+ return false;
+
+ return true;
+}
+
+static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
+
+ WARN_ON(root_ht == NULL);
+
+ if (root_ht && --root_ht->refcnt == 1)
+ u32_destroy_hnode(tp, root_ht, extack);
+
+ if (--tp_c->refcnt == 0) {
+ struct tc_u_hnode *ht;
+
+ hlist_del(&tp_c->hnode);
+
+ while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
+ u32_clear_hnode(tp, ht, extack);
+ RCU_INIT_POINTER(tp_c->hlist, ht->next);
+
+ /* u32_destroy_key() will later free ht for us, if it's
+ * still referenced by some knode
+ */
+ if (--ht->refcnt == 0)
+ kfree_rcu(ht, rcu);
+ }
+
+ idr_destroy(&tp_c->handle_idr);
+ kfree(tp_c);
+ }
+
+ tp->data = NULL;
+}
+
+static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_u_hnode *ht = arg;
+ struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
+ struct tc_u_common *tp_c = tp->data;
+ int ret = 0;
+
+ if (ht == NULL)
+ goto out;
+
+ if (TC_U32_KEY(ht->handle)) {
+ u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack);
+ ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
+ goto out;
+ }
+
+ if (root_ht == ht) {
+ NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node");
+ return -EINVAL;
+ }
+
+ if (ht->refcnt == 1) {
+ u32_destroy_hnode(tp, ht, extack);
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
+ return -EBUSY;
+ }
+
+out:
+ *last = true;
+ if (root_ht) {
+ if (root_ht->refcnt > 2) {
+ *last = false;
+ goto ret;
+ }
+ if (root_ht->refcnt == 2) {
+ if (!ht_empty(root_ht)) {
+ *last = false;
+ goto ret;
+ }
+ }
+ }
+
+ if (tp_c->refcnt > 1) {
+ *last = false;
+ goto ret;
+ }
+
+ if (tp_c->refcnt == 1) {
+ struct tc_u_hnode *ht;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next))
+ if (!ht_empty(ht)) {
+ *last = false;
+ break;
+ }
+ }
+
+ret:
+ return ret;
+}
+
+static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid)
+{
+ u32 index = htid | 0x800;
+ u32 max = htid | 0xFFF;
+
+ if (idr_alloc_u32(&ht->handle_idr, NULL, &index, max, GFP_KERNEL)) {
+ index = htid + 1;
+ if (idr_alloc_u32(&ht->handle_idr, NULL, &index, max,
+ GFP_KERNEL))
+ index = max;
+ }
+
+ return index;
+}
+
+static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
+ [TCA_U32_CLASSID] = { .type = NLA_U32 },
+ [TCA_U32_HASH] = { .type = NLA_U32 },
+ [TCA_U32_LINK] = { .type = NLA_U32 },
+ [TCA_U32_DIVISOR] = { .type = NLA_U32 },
+ [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) },
+ [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
+ [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) },
+ [TCA_U32_FLAGS] = { .type = NLA_U32 },
+};
+
+static int u32_set_parms(struct net *net, struct tcf_proto *tp,
+ unsigned long base, struct tc_u_hnode *ht,
+ struct tc_u_knode *n, struct nlattr **tb,
+ struct nlattr *est, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_U32_LINK]) {
+ u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
+ struct tc_u_hnode *ht_down = NULL, *ht_old;
+
+ if (TC_U32_KEY(handle)) {
+ NL_SET_ERR_MSG_MOD(extack, "u32 Link handle must be a hash table");
+ return -EINVAL;
+ }
+
+ if (handle) {
+ ht_down = u32_lookup_ht(ht->tp_c, handle);
+
+ if (!ht_down) {
+ NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
+ return -EINVAL;
+ }
+ ht_down->refcnt++;
+ }
+
+ ht_old = rtnl_dereference(n->ht_down);
+ rcu_assign_pointer(n->ht_down, ht_down);
+
+ if (ht_old)
+ ht_old->refcnt--;
+ }
+ if (tb[TCA_U32_CLASSID]) {
+ n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
+ tcf_bind_filter(tp, &n->res, base);
+ }
+
+#ifdef CONFIG_NET_CLS_IND
+ if (tb[TCA_U32_INDEV]) {
+ int ret;
+ ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
+ if (ret < 0)
+ return -EINVAL;
+ n->ifindex = ret;
+ }
+#endif
+ return 0;
+}
+
+static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
+ struct tc_u_knode *n)
+{
+ struct tc_u_knode __rcu **ins;
+ struct tc_u_knode *pins;
+ struct tc_u_hnode *ht;
+
+ if (TC_U32_HTID(n->handle) == TC_U32_ROOT)
+ ht = rtnl_dereference(tp->root);
+ else
+ ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle));
+
+ ins = &ht->ht[TC_U32_HASH(n->handle)];
+
+ /* The node must always exist for it to be replaced if this is not the
+ * case then something went very wrong elsewhere.
+ */
+ for (pins = rtnl_dereference(*ins); ;
+ ins = &pins->next, pins = rtnl_dereference(*ins))
+ if (pins->handle == n->handle)
+ break;
+
+ idr_replace(&ht->handle_idr, n, n->handle);
+ RCU_INIT_POINTER(n->next, pins->next);
+ rcu_assign_pointer(*ins, n);
+}
+
+static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
+ struct tc_u_knode *n)
+{
+ struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
+ struct tc_u32_sel *s = &n->sel;
+ struct tc_u_knode *new;
+
+ new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
+ GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ RCU_INIT_POINTER(new->next, n->next);
+ new->handle = n->handle;
+ RCU_INIT_POINTER(new->ht_up, n->ht_up);
+
+#ifdef CONFIG_NET_CLS_IND
+ new->ifindex = n->ifindex;
+#endif
+ new->fshift = n->fshift;
+ new->res = n->res;
+ new->flags = n->flags;
+ RCU_INIT_POINTER(new->ht_down, ht);
+
+#ifdef CONFIG_CLS_U32_PERF
+ /* Statistics may be incremented by readers during update
+ * so we must keep them in tact. When the node is later destroyed
+ * a special destroy call must be made to not free the pf memory.
+ */
+ new->pf = n->pf;
+#endif
+
+#ifdef CONFIG_CLS_U32_MARK
+ new->val = n->val;
+ new->mask = n->mask;
+ /* Similarly success statistics must be moved as pointers */
+ new->pcpu_success = n->pcpu_success;
+#endif
+ new->tp = tp;
+ memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
+
+ if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
+ kfree(new);
+ return NULL;
+ }
+
+ /* bump reference count as long as we hold pointer to structure */
+ if (ht)
+ ht->refcnt++;
+
+ return new;
+}
+
+static int u32_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, void **arg, bool ovr,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *ht;
+ struct tc_u_knode *n;
+ struct tc_u32_sel *s;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_U32_MAX + 1];
+ u32 htid, flags = 0;
+ size_t sel_size;
+ int err;
+#ifdef CONFIG_CLS_U32_PERF
+ size_t size;
+#endif
+
+ if (!opt) {
+ if (handle) {
+ NL_SET_ERR_MSG_MOD(extack, "Filter handle requires options");
+ return -EINVAL;
+ } else {
+ return 0;
+ }
+ }
+
+ err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_U32_FLAGS]) {
+ flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+ if (!tc_flags_valid(flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags");
+ return -EINVAL;
+ }
+ }
+
+ n = *arg;
+ if (n) {
+ struct tc_u_knode *new;
+
+ if (TC_U32_KEY(n->handle) == 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Key node id cannot be zero");
+ return -EINVAL;
+ }
+
+ if ((n->flags ^ flags) &
+ ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) {
+ NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags");
+ return -EINVAL;
+ }
+
+ new = u32_init_knode(tp, n);
+ if (!new)
+ return -ENOMEM;
+
+ err = u32_set_parms(net, tp, base,
+ rtnl_dereference(n->ht_up), new, tb,
+ tca[TCA_RATE], ovr, extack);
+
+ if (err) {
+ __u32_destroy_key(new);
+ return err;
+ }
+
+ err = u32_replace_hw_knode(tp, new, flags, extack);
+ if (err) {
+ __u32_destroy_key(new);
+ return err;
+ }
+
+ if (!tc_in_hw(new->flags))
+ new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
+ u32_replace_knode(tp, tp_c, new);
+ tcf_unbind_filter(tp, &n->res);
+ tcf_exts_get_net(&n->exts);
+ tcf_queue_work(&n->rwork, u32_delete_key_work);
+ return 0;
+ }
+
+ if (tb[TCA_U32_DIVISOR]) {
+ unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
+
+ if (--divisor > 0x100) {
+ NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets");
+ return -EINVAL;
+ }
+ if (TC_U32_KEY(handle)) {
+ NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table");
+ return -EINVAL;
+ }
+ ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
+ if (ht == NULL)
+ return -ENOBUFS;
+ if (handle == 0) {
+ handle = gen_new_htid(tp->data, ht);
+ if (handle == 0) {
+ kfree(ht);
+ return -ENOMEM;
+ }
+ } else {
+ err = idr_alloc_u32(&tp_c->handle_idr, ht, &handle,
+ handle, GFP_KERNEL);
+ if (err) {
+ kfree(ht);
+ return err;
+ }
+ }
+ ht->tp_c = tp_c;
+ ht->refcnt = 1;
+ ht->divisor = divisor;
+ ht->handle = handle;
+ ht->prio = tp->prio;
+ idr_init(&ht->handle_idr);
+ ht->flags = flags;
+
+ err = u32_replace_hw_hnode(tp, ht, flags, extack);
+ if (err) {
+ idr_remove(&tp_c->handle_idr, handle);
+ kfree(ht);
+ return err;
+ }
+
+ RCU_INIT_POINTER(ht->next, tp_c->hlist);
+ rcu_assign_pointer(tp_c->hlist, ht);
+ *arg = ht;
+
+ return 0;
+ }
+
+ if (tb[TCA_U32_HASH]) {
+ htid = nla_get_u32(tb[TCA_U32_HASH]);
+ if (TC_U32_HTID(htid) == TC_U32_ROOT) {
+ ht = rtnl_dereference(tp->root);
+ htid = ht->handle;
+ } else {
+ ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
+ if (!ht) {
+ NL_SET_ERR_MSG_MOD(extack, "Specified hash table not found");
+ return -EINVAL;
+ }
+ }
+ } else {
+ ht = rtnl_dereference(tp->root);
+ htid = ht->handle;
+ }
+
+ if (ht->divisor < TC_U32_HASH(htid)) {
+ NL_SET_ERR_MSG_MOD(extack, "Specified hash table buckets exceed configured value");
+ return -EINVAL;
+ }
+
+ if (handle) {
+ if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) {
+ NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch");
+ return -EINVAL;
+ }
+ handle = htid | TC_U32_NODE(handle);
+ err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle,
+ GFP_KERNEL);
+ if (err)
+ return err;
+ } else
+ handle = gen_new_kid(ht, htid);
+
+ if (tb[TCA_U32_SEL] == NULL) {
+ NL_SET_ERR_MSG_MOD(extack, "Selector not specified");
+ err = -EINVAL;
+ goto erridr;
+ }
+
+ s = nla_data(tb[TCA_U32_SEL]);
+ sel_size = struct_size(s, keys, s->nkeys);
+ if (nla_len(tb[TCA_U32_SEL]) < sel_size) {
+ err = -EINVAL;
+ goto erridr;
+ }
+
+ n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
+ if (n == NULL) {
+ err = -ENOBUFS;
+ goto erridr;
+ }
+
+#ifdef CONFIG_CLS_U32_PERF
+ size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
+ n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
+ if (!n->pf) {
+ err = -ENOBUFS;
+ goto errfree;
+ }
+#endif
+
+ memcpy(&n->sel, s, sel_size);
+ RCU_INIT_POINTER(n->ht_up, ht);
+ n->handle = handle;
+ n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+ n->flags = flags;
+ n->tp = tp;
+
+ err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
+ if (err < 0)
+ goto errout;
+
+#ifdef CONFIG_CLS_U32_MARK
+ n->pcpu_success = alloc_percpu(u32);
+ if (!n->pcpu_success) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ if (tb[TCA_U32_MARK]) {
+ struct tc_u32_mark *mark;
+
+ mark = nla_data(tb[TCA_U32_MARK]);
+ n->val = mark->val;
+ n->mask = mark->mask;
+ }
+#endif
+
+ err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr,
+ extack);
+ if (err == 0) {
+ struct tc_u_knode __rcu **ins;
+ struct tc_u_knode *pins;
+
+ err = u32_replace_hw_knode(tp, n, flags, extack);
+ if (err)
+ goto errhw;
+
+ if (!tc_in_hw(n->flags))
+ n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
+ ins = &ht->ht[TC_U32_HASH(handle)];
+ for (pins = rtnl_dereference(*ins); pins;
+ ins = &pins->next, pins = rtnl_dereference(*ins))
+ if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
+ break;
+
+ RCU_INIT_POINTER(n->next, pins);
+ rcu_assign_pointer(*ins, n);
+ *arg = n;
+ return 0;
+ }
+
+errhw:
+#ifdef CONFIG_CLS_U32_MARK
+ free_percpu(n->pcpu_success);
+#endif
+
+errout:
+ tcf_exts_destroy(&n->exts);
+#ifdef CONFIG_CLS_U32_PERF
+errfree:
+ free_percpu(n->pf);
+#endif
+ kfree(n);
+erridr:
+ idr_remove(&ht->handle_idr, handle);
+ return err;
+}
+
+static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *ht;
+ struct tc_u_knode *n;
+ unsigned int h;
+
+ if (arg->stop)
+ return;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next)) {
+ if (ht->prio != tp->prio)
+ continue;
+ if (arg->count >= arg->skip) {
+ if (arg->fn(tp, ht, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ }
+ arg->count++;
+ for (h = 0; h <= ht->divisor; h++) {
+ for (n = rtnl_dereference(ht->ht[h]);
+ n;
+ n = rtnl_dereference(n->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, n, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+ }
+}
+
+static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+ bool add, tc_setup_cb_t *cb, void *cb_priv,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_cls_u32_offload cls_u32 = {};
+ int err;
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack);
+ cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE;
+ cls_u32.hnode.divisor = ht->divisor;
+ cls_u32.hnode.handle = ht->handle;
+ cls_u32.hnode.prio = ht->prio;
+
+ err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
+ if (err && add && tc_skip_sw(ht->flags))
+ return err;
+
+ return 0;
+}
+
+static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+ bool add, tc_setup_cb_t *cb, void *cb_priv,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_u32_offload cls_u32 = {};
+ int err;
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
+ cls_u32.command = add ?
+ TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE;
+ cls_u32.knode.handle = n->handle;
+
+ if (add) {
+ cls_u32.knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+ cls_u32.knode.val = n->val;
+ cls_u32.knode.mask = n->mask;
+#else
+ cls_u32.knode.val = 0;
+ cls_u32.knode.mask = 0;
+#endif
+ cls_u32.knode.sel = &n->sel;
+ cls_u32.knode.exts = &n->exts;
+ if (n->ht_down)
+ cls_u32.knode.link_handle = ht->handle;
+ }
+
+ err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(n->flags))
+ return err;
+ return 0;
+ }
+
+ tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
+
+ return 0;
+}
+
+static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *ht;
+ struct tc_u_knode *n;
+ unsigned int h;
+ int err;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next)) {
+ if (ht->prio != tp->prio)
+ continue;
+
+ /* When adding filters to a new dev, try to offload the
+ * hashtable first. When removing, do the filters before the
+ * hashtable.
+ */
+ if (add && !tc_skip_hw(ht->flags)) {
+ err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv,
+ extack);
+ if (err)
+ return err;
+ }
+
+ for (h = 0; h <= ht->divisor; h++) {
+ for (n = rtnl_dereference(ht->ht[h]);
+ n;
+ n = rtnl_dereference(n->next)) {
+ if (tc_skip_hw(n->flags))
+ continue;
+
+ err = u32_reoffload_knode(tp, n, add, cb,
+ cb_priv, extack);
+ if (err)
+ return err;
+ }
+ }
+
+ if (!add && !tc_skip_hw(ht->flags))
+ u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack);
+ }
+
+ return 0;
+}
+
+static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
+{
+ struct tc_u_knode *n = fh;
+
+ if (n && n->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &n->res, base);
+ else
+ __tcf_unbind_filter(q, &n->res);
+ }
+}
+
+static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct tc_u_knode *n = fh;
+ struct tc_u_hnode *ht_up, *ht_down;
+ struct nlattr *nest;
+
+ if (n == NULL)
+ return skb->len;
+
+ t->tcm_handle = n->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (TC_U32_KEY(n->handle) == 0) {
+ struct tc_u_hnode *ht = fh;
+ u32 divisor = ht->divisor + 1;
+
+ if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
+ goto nla_put_failure;
+ } else {
+#ifdef CONFIG_CLS_U32_PERF
+ struct tc_u32_pcnt *gpf;
+ int cpu;
+#endif
+
+ if (nla_put(skb, TCA_U32_SEL,
+ sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
+ &n->sel))
+ goto nla_put_failure;
+
+ ht_up = rtnl_dereference(n->ht_up);
+ if (ht_up) {
+ u32 htid = n->handle & 0xFFFFF000;
+ if (nla_put_u32(skb, TCA_U32_HASH, htid))
+ goto nla_put_failure;
+ }
+ if (n->res.classid &&
+ nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
+ goto nla_put_failure;
+
+ ht_down = rtnl_dereference(n->ht_down);
+ if (ht_down &&
+ nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
+ goto nla_put_failure;
+
+ if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
+ goto nla_put_failure;
+
+#ifdef CONFIG_CLS_U32_MARK
+ if ((n->val || n->mask)) {
+ struct tc_u32_mark mark = {.val = n->val,
+ .mask = n->mask,
+ .success = 0};
+ int cpum;
+
+ for_each_possible_cpu(cpum) {
+ __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum);
+
+ mark.success += cnt;
+ }
+
+ if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
+ goto nla_put_failure;
+ }
+#endif
+
+ if (tcf_exts_dump(skb, &n->exts) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NET_CLS_IND
+ if (n->ifindex) {
+ struct net_device *dev;
+ dev = __dev_get_by_index(net, n->ifindex);
+ if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
+ goto nla_put_failure;
+ }
+#endif
+#ifdef CONFIG_CLS_U32_PERF
+ gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
+ n->sel.nkeys * sizeof(u64),
+ GFP_KERNEL);
+ if (!gpf)
+ goto nla_put_failure;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);
+
+ gpf->rcnt += pf->rcnt;
+ gpf->rhit += pf->rhit;
+ for (i = 0; i < n->sel.nkeys; i++)
+ gpf->kcnts[i] += pf->kcnts[i];
+ }
+
+ if (nla_put_64bit(skb, TCA_U32_PCNT,
+ sizeof(struct tc_u32_pcnt) +
+ n->sel.nkeys * sizeof(u64),
+ gpf, TCA_U32_PAD)) {
+ kfree(gpf);
+ goto nla_put_failure;
+ }
+ kfree(gpf);
+#endif
+ }
+
+ nla_nest_end(skb, nest);
+
+ if (TC_U32_KEY(n->handle))
+ if (tcf_exts_dump_stats(skb, &n->exts) < 0)
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_u32_ops __read_mostly = {
+ .kind = "u32",
+ .classify = u32_classify,
+ .init = u32_init,
+ .destroy = u32_destroy,
+ .get = u32_get,
+ .change = u32_change,
+ .delete = u32_delete,
+ .walk = u32_walk,
+ .reoffload = u32_reoffload,
+ .dump = u32_dump,
+ .bind_class = u32_bind_class,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_u32(void)
+{
+ int i, ret;
+
+ pr_info("u32 classifier\n");
+#ifdef CONFIG_CLS_U32_PERF
+ pr_info(" Performance counters on\n");
+#endif
+#ifdef CONFIG_NET_CLS_IND
+ pr_info(" input device check on\n");
+#endif
+#ifdef CONFIG_NET_CLS_ACT
+ pr_info(" Actions configured\n");
+#endif
+ tc_u_common_hash = kvmalloc_array(U32_HASH_SIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!tc_u_common_hash)
+ return -ENOMEM;
+
+ for (i = 0; i < U32_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&tc_u_common_hash[i]);
+
+ ret = register_tcf_proto_ops(&cls_u32_ops);
+ if (ret)
+ kvfree(tc_u_common_hash);
+ return ret;
+}
+
+static void __exit exit_u32(void)
+{
+ unregister_tcf_proto_ops(&cls_u32_ops);
+ kvfree(tc_u_common_hash);
+}
+
+module_init(init_u32)
+module_exit(exit_u32)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
new file mode 100644
index 000000000..ddd883ca5
--- /dev/null
+++ b/net/sched/em_canid.c
@@ -0,0 +1,233 @@
+/*
+ * em_canid.c Ematch rule to match CAN frames according to their CAN IDs
+ *
+ * This program is free software; you can distribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Copyright: (c) 2011 Czech Technical University in Prague
+ * (c) 2011 Volkswagen Group Research
+ * Authors: Michal Sojka <sojkam1@fel.cvut.cz>
+ * Pavel Pisa <pisa@cmp.felk.cvut.cz>
+ * Rostislav Lisovy <lisovy@gmail.cz>
+ * Funded by: Volkswagen Group Research
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+#include <linux/can.h>
+
+#define EM_CAN_RULES_MAX 500
+
+struct canid_match {
+ /* For each SFF CAN ID (11 bit) there is one record in this bitfield */
+ DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS));
+
+ int rules_count;
+ int sff_rules_count;
+ int eff_rules_count;
+
+ /*
+ * Raw rules copied from netlink message; Used for sending
+ * information to userspace (when 'tc filter show' is invoked)
+ * AND when matching EFF frames
+ */
+ struct can_filter rules_raw[];
+};
+
+/**
+ * em_canid_get_id() - Extracts Can ID out of the sk_buff structure.
+ */
+static canid_t em_canid_get_id(struct sk_buff *skb)
+{
+ /* CAN ID is stored within the data field */
+ struct can_frame *cf = (struct can_frame *)skb->data;
+
+ return cf->can_id;
+}
+
+static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id,
+ u32 can_mask)
+{
+ int i;
+
+ /*
+ * Limit can_mask and can_id to SFF range to
+ * protect against write after end of array
+ */
+ can_mask &= CAN_SFF_MASK;
+ can_id &= can_mask;
+
+ /* Single frame */
+ if (can_mask == CAN_SFF_MASK) {
+ set_bit(can_id, cm->match_sff);
+ return;
+ }
+
+ /* All frames */
+ if (can_mask == 0) {
+ bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS));
+ return;
+ }
+
+ /*
+ * Individual frame filter.
+ * Add record (set bit to 1) for each ID that
+ * conforms particular rule
+ */
+ for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) {
+ if ((i & can_mask) == can_id)
+ set_bit(i, cm->match_sff);
+ }
+}
+
+static inline struct canid_match *em_canid_priv(struct tcf_ematch *m)
+{
+ return (struct canid_match *)m->data;
+}
+
+static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ struct canid_match *cm = em_canid_priv(m);
+ canid_t can_id;
+ int match = 0;
+ int i;
+ const struct can_filter *lp;
+
+ can_id = em_canid_get_id(skb);
+
+ if (can_id & CAN_EFF_FLAG) {
+ for (i = 0, lp = cm->rules_raw;
+ i < cm->eff_rules_count; i++, lp++) {
+ if (!(((lp->can_id ^ can_id) & lp->can_mask))) {
+ match = 1;
+ break;
+ }
+ }
+ } else { /* SFF */
+ can_id &= CAN_SFF_MASK;
+ match = (test_bit(can_id, cm->match_sff) ? 1 : 0);
+ }
+
+ return match;
+}
+
+static int em_canid_change(struct net *net, void *data, int len,
+ struct tcf_ematch *m)
+{
+ struct can_filter *conf = data; /* Array with rules */
+ struct canid_match *cm;
+ int i;
+
+ if (!len)
+ return -EINVAL;
+
+ if (len % sizeof(struct can_filter))
+ return -EINVAL;
+
+ if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
+ return -EINVAL;
+
+ cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL);
+ if (!cm)
+ return -ENOMEM;
+
+ cm->rules_count = len / sizeof(struct can_filter);
+
+ /*
+ * We need two for() loops for copying rules into two contiguous
+ * areas in rules_raw to process all eff rules with a simple loop.
+ * NB: The configuration interface supports sff and eff rules.
+ * We do not support filters here that match for the same can_id
+ * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123).
+ * For this (unusual case) two filters have to be specified. The
+ * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id.
+ */
+
+ /* Fill rules_raw with EFF rules first */
+ for (i = 0; i < cm->rules_count; i++) {
+ if (conf[i].can_id & CAN_EFF_FLAG) {
+ memcpy(cm->rules_raw + cm->eff_rules_count,
+ &conf[i],
+ sizeof(struct can_filter));
+
+ cm->eff_rules_count++;
+ }
+ }
+
+ /* append SFF frame rules */
+ for (i = 0; i < cm->rules_count; i++) {
+ if (!(conf[i].can_id & CAN_EFF_FLAG)) {
+ memcpy(cm->rules_raw
+ + cm->eff_rules_count
+ + cm->sff_rules_count,
+ &conf[i], sizeof(struct can_filter));
+
+ cm->sff_rules_count++;
+
+ em_canid_sff_match_add(cm,
+ conf[i].can_id, conf[i].can_mask);
+ }
+ }
+
+ m->datalen = sizeof(struct canid_match) + len;
+ m->data = (unsigned long)cm;
+ return 0;
+}
+
+static void em_canid_destroy(struct tcf_ematch *m)
+{
+ struct canid_match *cm = em_canid_priv(m);
+
+ kfree(cm);
+}
+
+static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct canid_match *cm = em_canid_priv(m);
+
+ /*
+ * When configuring this ematch 'rules_count' is set not to exceed
+ * 'rules_raw' array size
+ */
+ if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count,
+ &cm->rules_raw) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static struct tcf_ematch_ops em_canid_ops = {
+ .kind = TCF_EM_CANID,
+ .change = em_canid_change,
+ .match = em_canid_match,
+ .destroy = em_canid_destroy,
+ .dump = em_canid_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_canid_ops.link)
+};
+
+static int __init init_em_canid(void)
+{
+ return tcf_em_register(&em_canid_ops);
+}
+
+static void __exit exit_em_canid(void)
+{
+ tcf_em_unregister(&em_canid_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_canid);
+module_exit(exit_em_canid);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
new file mode 100644
index 000000000..1c8360a27
--- /dev/null
+++ b/net/sched/em_cmp.c
@@ -0,0 +1,99 @@
+/*
+ * net/sched/em_cmp.c Simple packet data comparison ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_cmp.h>
+#include <asm/unaligned.h>
+#include <net/pkt_cls.h>
+
+static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
+{
+ return unlikely(cmp->flags & TCF_EM_CMP_TRANS);
+}
+
+static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
+ unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
+ u32 val = 0;
+
+ if (!tcf_valid_offset(skb, ptr, cmp->align))
+ return 0;
+
+ switch (cmp->align) {
+ case TCF_EM_ALIGN_U8:
+ val = *ptr;
+ break;
+
+ case TCF_EM_ALIGN_U16:
+ val = get_unaligned_be16(ptr);
+
+ if (cmp_needs_transformation(cmp))
+ val = be16_to_cpu(val);
+ break;
+
+ case TCF_EM_ALIGN_U32:
+ /* Worth checking boundries? The branching seems
+ * to get worse. Visit again.
+ */
+ val = get_unaligned_be32(ptr);
+
+ if (cmp_needs_transformation(cmp))
+ val = be32_to_cpu(val);
+ break;
+
+ default:
+ return 0;
+ }
+
+ if (cmp->mask)
+ val &= cmp->mask;
+
+ switch (cmp->opnd) {
+ case TCF_EM_OPND_EQ:
+ return val == cmp->val;
+ case TCF_EM_OPND_LT:
+ return val < cmp->val;
+ case TCF_EM_OPND_GT:
+ return val > cmp->val;
+ }
+
+ return 0;
+}
+
+static struct tcf_ematch_ops em_cmp_ops = {
+ .kind = TCF_EM_CMP,
+ .datalen = sizeof(struct tcf_em_cmp),
+ .match = em_cmp_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_cmp_ops.link)
+};
+
+static int __init init_em_cmp(void)
+{
+ return tcf_em_register(&em_cmp_ops);
+}
+
+static void __exit exit_em_cmp(void)
+{
+ tcf_em_unregister(&em_cmp_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_cmp);
+module_exit(exit_em_cmp);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP);
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
new file mode 100644
index 000000000..ef3b6b66c
--- /dev/null
+++ b/net/sched/em_ipset.c
@@ -0,0 +1,137 @@
+/*
+ * net/sched/em_ipset.c ipset ematch
+ *
+ * Copyright (c) 2012 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_set.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/pkt_cls.h>
+
+static int em_ipset_change(struct net *net, void *data, int data_len,
+ struct tcf_ematch *em)
+{
+ struct xt_set_info *set = data;
+ ip_set_id_t index;
+
+ if (data_len != sizeof(*set))
+ return -EINVAL;
+
+ index = ip_set_nfnl_get_byindex(net, set->index);
+ if (index == IPSET_INVALID_ID)
+ return -ENOENT;
+
+ em->datalen = sizeof(*set);
+ em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
+ if (em->data)
+ return 0;
+
+ ip_set_nfnl_put(net, index);
+ return -ENOMEM;
+}
+
+static void em_ipset_destroy(struct tcf_ematch *em)
+{
+ const struct xt_set_info *set = (const void *) em->data;
+ if (set) {
+ ip_set_nfnl_put(em->net, set->index);
+ kfree((void *) em->data);
+ }
+}
+
+static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct ip_set_adt_opt opt;
+ struct xt_action_param acpar;
+ const struct xt_set_info *set = (const void *) em->data;
+ struct net_device *dev, *indev = NULL;
+ struct nf_hook_state state = {
+ .net = em->net,
+ };
+ int ret, network_offset;
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ state.pf = NFPROTO_IPV4;
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+ acpar.thoff = ip_hdrlen(skb);
+ break;
+ case htons(ETH_P_IPV6):
+ state.pf = NFPROTO_IPV6;
+ if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+ /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
+ acpar.thoff = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return 0;
+ }
+
+ opt.family = state.pf;
+ opt.dim = set->dim;
+ opt.flags = set->flags;
+ opt.cmdflags = 0;
+ opt.ext.timeout = ~0u;
+
+ network_offset = skb_network_offset(skb);
+ skb_pull(skb, network_offset);
+
+ dev = skb->dev;
+
+ rcu_read_lock();
+
+ if (skb->skb_iif)
+ indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
+
+ state.in = indev ? indev : dev;
+ state.out = dev;
+ acpar.state = &state;
+
+ ret = ip_set_test(set->index, skb, &acpar, &opt);
+
+ rcu_read_unlock();
+
+ skb_push(skb, network_offset);
+ return ret;
+}
+
+static struct tcf_ematch_ops em_ipset_ops = {
+ .kind = TCF_EM_IPSET,
+ .change = em_ipset_change,
+ .destroy = em_ipset_destroy,
+ .match = em_ipset_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_ipset_ops.link)
+};
+
+static int __init init_em_ipset(void)
+{
+ return tcf_em_register(&em_ipset_ops);
+}
+
+static void __exit exit_em_ipset(void)
+{
+ tcf_em_unregister(&em_ipset_ops);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("TC extended match for IP sets");
+
+module_init(init_em_ipset);
+module_exit(exit_em_ipset);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET);
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
new file mode 100644
index 000000000..a5f34e930
--- /dev/null
+++ b/net/sched/em_ipt.c
@@ -0,0 +1,257 @@
+/*
+ * net/sched/em_ipt.c IPtables matches Ematch
+ *
+ * (c) 2018 Eyal Birger <eyal.birger@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_ipt.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/pkt_cls.h>
+
+struct em_ipt_match {
+ const struct xt_match *match;
+ u32 hook;
+ u8 match_data[0] __aligned(8);
+};
+
+struct em_ipt_xt_match {
+ char *match_name;
+ int (*validate_match_data)(struct nlattr **tb, u8 mrev);
+};
+
+static const struct nla_policy em_ipt_policy[TCA_EM_IPT_MAX + 1] = {
+ [TCA_EM_IPT_MATCH_NAME] = { .type = NLA_STRING,
+ .len = XT_EXTENSION_MAXNAMELEN },
+ [TCA_EM_IPT_MATCH_REVISION] = { .type = NLA_U8 },
+ [TCA_EM_IPT_HOOK] = { .type = NLA_U32 },
+ [TCA_EM_IPT_NFPROTO] = { .type = NLA_U8 },
+ [TCA_EM_IPT_MATCH_DATA] = { .type = NLA_UNSPEC },
+};
+
+static int check_match(struct net *net, struct em_ipt_match *im, int mdata_len)
+{
+ struct xt_mtchk_param mtpar = {};
+ union {
+ struct ipt_entry e4;
+ struct ip6t_entry e6;
+ } e = {};
+
+ mtpar.net = net;
+ mtpar.table = "filter";
+ mtpar.hook_mask = 1 << im->hook;
+ mtpar.family = im->match->family;
+ mtpar.match = im->match;
+ mtpar.entryinfo = &e;
+ mtpar.matchinfo = (void *)im->match_data;
+ return xt_check_match(&mtpar, mdata_len, 0, 0);
+}
+
+static int policy_validate_match_data(struct nlattr **tb, u8 mrev)
+{
+ if (mrev != 0) {
+ pr_err("only policy match revision 0 supported");
+ return -EINVAL;
+ }
+
+ if (nla_get_u32(tb[TCA_EM_IPT_HOOK]) != NF_INET_PRE_ROUTING) {
+ pr_err("policy can only be matched on NF_INET_PRE_ROUTING");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct em_ipt_xt_match em_ipt_xt_matches[] = {
+ {
+ .match_name = "policy",
+ .validate_match_data = policy_validate_match_data
+ },
+ {}
+};
+
+static struct xt_match *get_xt_match(struct nlattr **tb)
+{
+ const struct em_ipt_xt_match *m;
+ struct nlattr *mname_attr;
+ u8 nfproto, mrev = 0;
+ int ret;
+
+ mname_attr = tb[TCA_EM_IPT_MATCH_NAME];
+ for (m = em_ipt_xt_matches; m->match_name; m++) {
+ if (!nla_strcmp(mname_attr, m->match_name))
+ break;
+ }
+
+ if (!m->match_name) {
+ pr_err("Unsupported xt match");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (tb[TCA_EM_IPT_MATCH_REVISION])
+ mrev = nla_get_u8(tb[TCA_EM_IPT_MATCH_REVISION]);
+
+ ret = m->validate_match_data(tb, mrev);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]);
+ return xt_request_find_match(nfproto, m->match_name, mrev);
+}
+
+static int em_ipt_change(struct net *net, void *data, int data_len,
+ struct tcf_ematch *em)
+{
+ struct nlattr *tb[TCA_EM_IPT_MAX + 1];
+ struct em_ipt_match *im = NULL;
+ struct xt_match *match;
+ int mdata_len, ret;
+
+ ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy,
+ NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[TCA_EM_IPT_HOOK] || !tb[TCA_EM_IPT_MATCH_NAME] ||
+ !tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO])
+ return -EINVAL;
+
+ match = get_xt_match(tb);
+ if (IS_ERR(match)) {
+ pr_err("unable to load match\n");
+ return PTR_ERR(match);
+ }
+
+ mdata_len = XT_ALIGN(nla_len(tb[TCA_EM_IPT_MATCH_DATA]));
+ im = kzalloc(sizeof(*im) + mdata_len, GFP_KERNEL);
+ if (!im) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ im->match = match;
+ im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]);
+ nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len);
+
+ ret = check_match(net, im, mdata_len);
+ if (ret)
+ goto err;
+
+ em->datalen = sizeof(*im) + mdata_len;
+ em->data = (unsigned long)im;
+ return 0;
+
+err:
+ kfree(im);
+ module_put(match->me);
+ return ret;
+}
+
+static void em_ipt_destroy(struct tcf_ematch *em)
+{
+ struct em_ipt_match *im = (void *)em->data;
+
+ if (!im)
+ return;
+
+ if (im->match->destroy) {
+ struct xt_mtdtor_param par = {
+ .net = em->net,
+ .match = im->match,
+ .matchinfo = im->match_data,
+ .family = im->match->family
+ };
+ im->match->destroy(&par);
+ }
+ module_put(im->match->me);
+ kfree((void *)im);
+}
+
+static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ const struct em_ipt_match *im = (const void *)em->data;
+ struct xt_action_param acpar = {};
+ struct net_device *indev = NULL;
+ struct nf_hook_state state;
+ int ret;
+
+ rcu_read_lock();
+
+ if (skb->skb_iif)
+ indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
+
+ nf_hook_state_init(&state, im->hook, im->match->family,
+ indev ?: skb->dev, skb->dev, NULL, em->net, NULL);
+
+ acpar.match = im->match;
+ acpar.matchinfo = im->match_data;
+ acpar.state = &state;
+
+ ret = im->match->match(skb, &acpar);
+
+ rcu_read_unlock();
+ return ret;
+}
+
+static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em)
+{
+ struct em_ipt_match *im = (void *)em->data;
+
+ if (nla_put_string(skb, TCA_EM_IPT_MATCH_NAME, im->match->name) < 0)
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, TCA_EM_IPT_HOOK, im->hook) < 0)
+ return -EMSGSIZE;
+ if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0)
+ return -EMSGSIZE;
+ if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0)
+ return -EMSGSIZE;
+ if (nla_put(skb, TCA_EM_IPT_MATCH_DATA,
+ im->match->usersize ?: im->match->matchsize,
+ im->match_data) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static struct tcf_ematch_ops em_ipt_ops = {
+ .kind = TCF_EM_IPT,
+ .change = em_ipt_change,
+ .destroy = em_ipt_destroy,
+ .match = em_ipt_match,
+ .dump = em_ipt_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_ipt_ops.link)
+};
+
+static int __init init_em_ipt(void)
+{
+ return tcf_em_register(&em_ipt_ops);
+}
+
+static void __exit exit_em_ipt(void)
+{
+ tcf_em_unregister(&em_ipt_ops);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eyal Birger <eyal.birger@gmail.com>");
+MODULE_DESCRIPTION("TC extended match for IPtables matches");
+
+module_init(init_em_ipt);
+module_exit(exit_em_ipt);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPT);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
new file mode 100644
index 000000000..e36fa9272
--- /dev/null
+++ b/net/sched/em_meta.c
@@ -0,0 +1,1014 @@
+/*
+ * net/sched/em_meta.c Metadata ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ * The metadata ematch compares two meta objects where each object
+ * represents either a meta value stored in the kernel or a static
+ * value provided by userspace. The objects are not provided by
+ * userspace itself but rather a definition providing the information
+ * to build them. Every object is of a certain type which must be
+ * equal to the object it is being compared to.
+ *
+ * The definition of a objects conists of the type (meta type), a
+ * identifier (meta id) and additional type specific information.
+ * The meta id is either TCF_META_TYPE_VALUE for values provided by
+ * userspace or a index to the meta operations table consisting of
+ * function pointers to type specific meta data collectors returning
+ * the value of the requested meta value.
+ *
+ * lvalue rvalue
+ * +-----------+ +-----------+
+ * | type: INT | | type: INT |
+ * def | id: DEV | | id: VALUE |
+ * | data: | | data: 3 |
+ * +-----------+ +-----------+
+ * | |
+ * ---> meta_ops[INT][DEV](...) |
+ * | |
+ * ----------- |
+ * V V
+ * +-----------+ +-----------+
+ * | type: INT | | type: INT |
+ * obj | id: DEV | | id: VALUE |
+ * | data: 2 |<--data got filled out | data: 3 |
+ * +-----------+ +-----------+
+ * | |
+ * --------------> 2 equals 3 <--------------
+ *
+ * This is a simplified schema, the complexity varies depending
+ * on the meta type. Obviously, the length of the data must also
+ * be provided for non-numeric types.
+ *
+ * Additionally, type dependent modifiers such as shift operators
+ * or mask may be applied to extend the functionaliy. As of now,
+ * the variable length type supports shifting the byte string to
+ * the right, eating up any number of octets and thus supporting
+ * wildcard interface name comparisons such as "ppp%" matching
+ * ppp0..9.
+ *
+ * NOTE: Certain meta values depend on other subsystems and are
+ * only available if that subsystem is enabled in the kernel.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/if_vlan.h>
+#include <linux/tc_ematch/tc_em_meta.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+struct meta_obj {
+ unsigned long value;
+ unsigned int len;
+};
+
+struct meta_value {
+ struct tcf_meta_val hdr;
+ unsigned long val;
+ unsigned int len;
+};
+
+struct meta_match {
+ struct meta_value lvalue;
+ struct meta_value rvalue;
+};
+
+static inline int meta_id(struct meta_value *v)
+{
+ return TCF_META_ID(v->hdr.kind);
+}
+
+static inline int meta_type(struct meta_value *v)
+{
+ return TCF_META_TYPE(v->hdr.kind);
+}
+
+#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
+ struct tcf_pkt_info *info, struct meta_value *v, \
+ struct meta_obj *dst, int *err)
+
+/**************************************************************************
+ * System status & misc
+ **************************************************************************/
+
+META_COLLECTOR(int_random)
+{
+ get_random_bytes(&dst->value, sizeof(dst->value));
+}
+
+static inline unsigned long fixed_loadavg(int load)
+{
+ int rnd_load = load + (FIXED_1/200);
+ int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT;
+
+ return ((rnd_load >> FSHIFT) * 100) + rnd_frac;
+}
+
+META_COLLECTOR(int_loadavg_0)
+{
+ dst->value = fixed_loadavg(avenrun[0]);
+}
+
+META_COLLECTOR(int_loadavg_1)
+{
+ dst->value = fixed_loadavg(avenrun[1]);
+}
+
+META_COLLECTOR(int_loadavg_2)
+{
+ dst->value = fixed_loadavg(avenrun[2]);
+}
+
+/**************************************************************************
+ * Device names & indices
+ **************************************************************************/
+
+static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = dev->ifindex;
+ return 0;
+}
+
+static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = (unsigned long) dev->name;
+ dst->len = strlen(dev->name);
+ return 0;
+}
+
+META_COLLECTOR(int_dev)
+{
+ *err = int_dev(skb->dev, dst);
+}
+
+META_COLLECTOR(var_dev)
+{
+ *err = var_dev(skb->dev, dst);
+}
+
+/**************************************************************************
+ * vlan tag
+ **************************************************************************/
+
+META_COLLECTOR(int_vlan_tag)
+{
+ unsigned short tag;
+
+ if (skb_vlan_tag_present(skb))
+ dst->value = skb_vlan_tag_get(skb);
+ else if (!__vlan_get_tag(skb, &tag))
+ dst->value = tag;
+ else
+ *err = -1;
+}
+
+
+
+/**************************************************************************
+ * skb attributes
+ **************************************************************************/
+
+META_COLLECTOR(int_priority)
+{
+ dst->value = skb->priority;
+}
+
+META_COLLECTOR(int_protocol)
+{
+ /* Let userspace take care of the byte ordering */
+ dst->value = skb_protocol(skb, false);
+}
+
+META_COLLECTOR(int_pkttype)
+{
+ dst->value = skb->pkt_type;
+}
+
+META_COLLECTOR(int_pktlen)
+{
+ dst->value = skb->len;
+}
+
+META_COLLECTOR(int_datalen)
+{
+ dst->value = skb->data_len;
+}
+
+META_COLLECTOR(int_maclen)
+{
+ dst->value = skb->mac_len;
+}
+
+META_COLLECTOR(int_rxhash)
+{
+ dst->value = skb_get_hash(skb);
+}
+
+/**************************************************************************
+ * Netfilter
+ **************************************************************************/
+
+META_COLLECTOR(int_mark)
+{
+ dst->value = skb->mark;
+}
+
+/**************************************************************************
+ * Traffic Control
+ **************************************************************************/
+
+META_COLLECTOR(int_tcindex)
+{
+ dst->value = skb->tc_index;
+}
+
+/**************************************************************************
+ * Routing
+ **************************************************************************/
+
+META_COLLECTOR(int_rtclassid)
+{
+ if (unlikely(skb_dst(skb) == NULL))
+ *err = -1;
+ else
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ dst->value = skb_dst(skb)->tclassid;
+#else
+ dst->value = 0;
+#endif
+}
+
+META_COLLECTOR(int_rtiif)
+{
+ if (unlikely(skb_rtable(skb) == NULL))
+ *err = -1;
+ else
+ dst->value = inet_iif(skb);
+}
+
+/**************************************************************************
+ * Socket Attributes
+ **************************************************************************/
+
+#define skip_nonlocal(skb) \
+ (unlikely(skb->sk == NULL))
+
+META_COLLECTOR(int_sk_family)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_family;
+}
+
+META_COLLECTOR(int_sk_state)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_state;
+}
+
+META_COLLECTOR(int_sk_reuse)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_reuse;
+}
+
+META_COLLECTOR(int_sk_bound_if)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ /* No error if bound_dev_if is 0, legal userspace check */
+ dst->value = skb->sk->sk_bound_dev_if;
+}
+
+META_COLLECTOR(var_sk_bound_if)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+
+ if (skb->sk->sk_bound_dev_if == 0) {
+ dst->value = (unsigned long) "any";
+ dst->len = 3;
+ } else {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(skb->sk),
+ skb->sk->sk_bound_dev_if);
+ *err = var_dev(dev, dst);
+ rcu_read_unlock();
+ }
+}
+
+META_COLLECTOR(int_sk_refcnt)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = refcount_read(&skb->sk->sk_refcnt);
+}
+
+META_COLLECTOR(int_sk_rcvbuf)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_rcvbuf;
+}
+
+META_COLLECTOR(int_sk_shutdown)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_shutdown;
+}
+
+META_COLLECTOR(int_sk_proto)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_protocol;
+}
+
+META_COLLECTOR(int_sk_type)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_type;
+}
+
+META_COLLECTOR(int_sk_rmem_alloc)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk_rmem_alloc_get(sk);
+}
+
+META_COLLECTOR(int_sk_wmem_alloc)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk_wmem_alloc_get(sk);
+}
+
+META_COLLECTOR(int_sk_omem_alloc)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = atomic_read(&sk->sk_omem_alloc);
+}
+
+META_COLLECTOR(int_sk_rcv_qlen)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_receive_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_snd_qlen)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_write_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_wmem_queued)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_wmem_queued;
+}
+
+META_COLLECTOR(int_sk_fwd_alloc)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_forward_alloc;
+}
+
+META_COLLECTOR(int_sk_sndbuf)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_sndbuf;
+}
+
+META_COLLECTOR(int_sk_alloc)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = (__force int) sk->sk_allocation;
+}
+
+META_COLLECTOR(int_sk_hash)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_hash;
+}
+
+META_COLLECTOR(int_sk_lingertime)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_lingertime / HZ;
+}
+
+META_COLLECTOR(int_sk_err_qlen)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_error_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_ack_bl)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_max_ack_bl)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_max_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_prio)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_priority;
+}
+
+META_COLLECTOR(int_sk_rcvlowat)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_rcvlowat;
+}
+
+META_COLLECTOR(int_sk_rcvtimeo)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_rcvtimeo / HZ;
+}
+
+META_COLLECTOR(int_sk_sndtimeo)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_sndtimeo / HZ;
+}
+
+META_COLLECTOR(int_sk_sendmsg_off)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_frag.offset;
+}
+
+META_COLLECTOR(int_sk_write_pend)
+{
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk->sk_write_pending;
+}
+
+/**************************************************************************
+ * Meta value collectors assignment table
+ **************************************************************************/
+
+struct meta_ops {
+ void (*get)(struct sk_buff *, struct tcf_pkt_info *,
+ struct meta_value *, struct meta_obj *, int *);
+};
+
+#define META_ID(name) TCF_META_ID_##name
+#define META_FUNC(name) { .get = meta_##name }
+
+/* Meta value operations table listing all meta value collectors and
+ * assigns them to a type and meta id. */
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
+ [TCF_META_TYPE_VAR] = {
+ [META_ID(DEV)] = META_FUNC(var_dev),
+ [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
+ },
+ [TCF_META_TYPE_INT] = {
+ [META_ID(RANDOM)] = META_FUNC(int_random),
+ [META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0),
+ [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
+ [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
+ [META_ID(DEV)] = META_FUNC(int_dev),
+ [META_ID(PRIORITY)] = META_FUNC(int_priority),
+ [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
+ [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
+ [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
+ [META_ID(DATALEN)] = META_FUNC(int_datalen),
+ [META_ID(MACLEN)] = META_FUNC(int_maclen),
+ [META_ID(NFMARK)] = META_FUNC(int_mark),
+ [META_ID(TCINDEX)] = META_FUNC(int_tcindex),
+ [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
+ [META_ID(RTIIF)] = META_FUNC(int_rtiif),
+ [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
+ [META_ID(SK_STATE)] = META_FUNC(int_sk_state),
+ [META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse),
+ [META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if),
+ [META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt),
+ [META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf),
+ [META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf),
+ [META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown),
+ [META_ID(SK_PROTO)] = META_FUNC(int_sk_proto),
+ [META_ID(SK_TYPE)] = META_FUNC(int_sk_type),
+ [META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc),
+ [META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc),
+ [META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc),
+ [META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued),
+ [META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen),
+ [META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen),
+ [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
+ [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
+ [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
+ [META_ID(SK_HASH)] = META_FUNC(int_sk_hash),
+ [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
+ [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
+ [META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
+ [META_ID(SK_PRIO)] = META_FUNC(int_sk_prio),
+ [META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat),
+ [META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo),
+ [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
+ [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
+ [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
+ [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
+ [META_ID(RXHASH)] = META_FUNC(int_rxhash),
+ }
+};
+
+static inline struct meta_ops *meta_ops(struct meta_value *val)
+{
+ return &__meta_ops[meta_type(val)][meta_id(val)];
+}
+
+/**************************************************************************
+ * Type specific operations for TCF_META_TYPE_VAR
+ **************************************************************************/
+
+static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
+{
+ int r = a->len - b->len;
+
+ if (r == 0)
+ r = memcmp((void *) a->value, (void *) b->value, a->len);
+
+ return r;
+}
+
+static int meta_var_change(struct meta_value *dst, struct nlattr *nla)
+{
+ int len = nla_len(nla);
+
+ dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL);
+ if (dst->val == 0UL)
+ return -ENOMEM;
+ dst->len = len;
+ return 0;
+}
+
+static void meta_var_destroy(struct meta_value *v)
+{
+ kfree((void *) v->val);
+}
+
+static void meta_var_apply_extras(struct meta_value *v,
+ struct meta_obj *dst)
+{
+ int shift = v->hdr.shift;
+
+ if (shift && shift < dst->len)
+ dst->len -= shift;
+}
+
+static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
+{
+ if (v->val && v->len &&
+ nla_put(skb, tlv, v->len, (void *) v->val))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+/**************************************************************************
+ * Type specific operations for TCF_META_TYPE_INT
+ **************************************************************************/
+
+static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
+{
+ /* Let gcc optimize it, the unlikely is not really based on
+ * some numbers but jump free code for mismatches seems
+ * more logical. */
+ if (unlikely(a->value == b->value))
+ return 0;
+ else if (a->value < b->value)
+ return -1;
+ else
+ return 1;
+}
+
+static int meta_int_change(struct meta_value *dst, struct nlattr *nla)
+{
+ if (nla_len(nla) >= sizeof(unsigned long)) {
+ dst->val = *(unsigned long *) nla_data(nla);
+ dst->len = sizeof(unsigned long);
+ } else if (nla_len(nla) == sizeof(u32)) {
+ dst->val = nla_get_u32(nla);
+ dst->len = sizeof(u32);
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+static void meta_int_apply_extras(struct meta_value *v,
+ struct meta_obj *dst)
+{
+ if (v->hdr.shift)
+ dst->value >>= v->hdr.shift;
+
+ if (v->val)
+ dst->value &= v->val;
+}
+
+static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
+{
+ if (v->len == sizeof(unsigned long)) {
+ if (nla_put(skb, tlv, sizeof(unsigned long), &v->val))
+ goto nla_put_failure;
+ } else if (v->len == sizeof(u32)) {
+ if (nla_put_u32(skb, tlv, v->val))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+/**************************************************************************
+ * Type specific operations table
+ **************************************************************************/
+
+struct meta_type_ops {
+ void (*destroy)(struct meta_value *);
+ int (*compare)(struct meta_obj *, struct meta_obj *);
+ int (*change)(struct meta_value *, struct nlattr *);
+ void (*apply_extras)(struct meta_value *, struct meta_obj *);
+ int (*dump)(struct sk_buff *, struct meta_value *, int);
+};
+
+static const struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
+ [TCF_META_TYPE_VAR] = {
+ .destroy = meta_var_destroy,
+ .compare = meta_var_compare,
+ .change = meta_var_change,
+ .apply_extras = meta_var_apply_extras,
+ .dump = meta_var_dump
+ },
+ [TCF_META_TYPE_INT] = {
+ .compare = meta_int_compare,
+ .change = meta_int_change,
+ .apply_extras = meta_int_apply_extras,
+ .dump = meta_int_dump
+ }
+};
+
+static inline const struct meta_type_ops *meta_type_ops(struct meta_value *v)
+{
+ return &__meta_type_ops[meta_type(v)];
+}
+
+/**************************************************************************
+ * Core
+ **************************************************************************/
+
+static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ int err = 0;
+
+ if (meta_id(v) == TCF_META_ID_VALUE) {
+ dst->value = v->val;
+ dst->len = v->len;
+ return 0;
+ }
+
+ meta_ops(v)->get(skb, info, v, dst, &err);
+ if (err < 0)
+ return err;
+
+ if (meta_type_ops(v)->apply_extras)
+ meta_type_ops(v)->apply_extras(v, dst);
+
+ return 0;
+}
+
+static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ int r;
+ struct meta_match *meta = (struct meta_match *) m->data;
+ struct meta_obj l_value, r_value;
+
+ if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
+ meta_get(skb, info, &meta->rvalue, &r_value) < 0)
+ return 0;
+
+ r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
+
+ switch (meta->lvalue.hdr.op) {
+ case TCF_EM_OPND_EQ:
+ return !r;
+ case TCF_EM_OPND_LT:
+ return r < 0;
+ case TCF_EM_OPND_GT:
+ return r > 0;
+ }
+
+ return 0;
+}
+
+static void meta_delete(struct meta_match *meta)
+{
+ if (meta) {
+ const struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
+
+ if (ops && ops->destroy) {
+ ops->destroy(&meta->lvalue);
+ ops->destroy(&meta->rvalue);
+ }
+ }
+
+ kfree(meta);
+}
+
+static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
+{
+ if (nla) {
+ if (nla_len(nla) == 0)
+ return -EINVAL;
+
+ return meta_type_ops(dst)->change(dst, nla);
+ }
+
+ return 0;
+}
+
+static inline int meta_is_supported(struct meta_value *val)
+{
+ return !meta_id(val) || meta_ops(val)->get;
+}
+
+static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
+ [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) },
+};
+
+static int em_meta_change(struct net *net, void *data, int len,
+ struct tcf_ematch *m)
+{
+ int err;
+ struct nlattr *tb[TCA_EM_META_MAX + 1];
+ struct tcf_meta_hdr *hdr;
+ struct meta_match *meta = NULL;
+
+ err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ if (tb[TCA_EM_META_HDR] == NULL)
+ goto errout;
+ hdr = nla_data(tb[TCA_EM_META_HDR]);
+
+ if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
+ TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
+ TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
+ TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
+ goto errout;
+
+ meta = kzalloc(sizeof(*meta), GFP_KERNEL);
+ if (meta == NULL) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
+ memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
+
+ if (!meta_is_supported(&meta->lvalue) ||
+ !meta_is_supported(&meta->rvalue)) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 ||
+ meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0)
+ goto errout;
+
+ m->datalen = sizeof(*meta);
+ m->data = (unsigned long) meta;
+
+ err = 0;
+errout:
+ if (err && meta)
+ meta_delete(meta);
+ return err;
+}
+
+static void em_meta_destroy(struct tcf_ematch *m)
+{
+ if (m)
+ meta_delete((struct meta_match *) m->data);
+}
+
+static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
+{
+ struct meta_match *meta = (struct meta_match *) em->data;
+ struct tcf_meta_hdr hdr;
+ const struct meta_type_ops *ops;
+
+ memset(&hdr, 0, sizeof(hdr));
+ memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
+ memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
+
+ if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr))
+ goto nla_put_failure;
+
+ ops = meta_type_ops(&meta->lvalue);
+ if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
+ ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct tcf_ematch_ops em_meta_ops = {
+ .kind = TCF_EM_META,
+ .change = em_meta_change,
+ .match = em_meta_match,
+ .destroy = em_meta_destroy,
+ .dump = em_meta_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_meta_ops.link)
+};
+
+static int __init init_em_meta(void)
+{
+ return tcf_em_register(&em_meta_ops);
+}
+
+static void __exit exit_em_meta(void)
+{
+ tcf_em_unregister(&em_meta_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_meta);
+module_exit(exit_em_meta);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
new file mode 100644
index 000000000..07c10bac0
--- /dev/null
+++ b/net/sched/em_nbyte.c
@@ -0,0 +1,80 @@
+/*
+ * net/sched/em_nbyte.c N-Byte ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_nbyte.h>
+#include <net/pkt_cls.h>
+
+struct nbyte_data {
+ struct tcf_em_nbyte hdr;
+ char pattern[0];
+};
+
+static int em_nbyte_change(struct net *net, void *data, int data_len,
+ struct tcf_ematch *em)
+{
+ struct tcf_em_nbyte *nbyte = data;
+
+ if (data_len < sizeof(*nbyte) ||
+ data_len < (sizeof(*nbyte) + nbyte->len))
+ return -EINVAL;
+
+ em->datalen = sizeof(*nbyte) + nbyte->len;
+ em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
+ if (em->data == 0UL)
+ return -ENOBUFS;
+
+ return 0;
+}
+
+static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
+ unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
+
+ ptr += nbyte->hdr.off;
+
+ if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
+ return 0;
+
+ return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
+}
+
+static struct tcf_ematch_ops em_nbyte_ops = {
+ .kind = TCF_EM_NBYTE,
+ .change = em_nbyte_change,
+ .match = em_nbyte_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_nbyte_ops.link)
+};
+
+static int __init init_em_nbyte(void)
+{
+ return tcf_em_register(&em_nbyte_ops);
+}
+
+static void __exit exit_em_nbyte(void)
+{
+ tcf_em_unregister(&em_nbyte_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_nbyte);
+module_exit(exit_em_nbyte);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE);
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000..73e2ed576
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
+/*
+ * net/sched/em_text.c Textsearch ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/textsearch.h>
+#include <linux/tc_ematch/tc_em_text.h>
+#include <net/pkt_cls.h>
+
+struct text_match {
+ u16 from_offset;
+ u16 to_offset;
+ u8 from_layer;
+ u8 to_layer;
+ struct ts_config *config;
+};
+
+#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
+
+static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ struct text_match *tm = EM_TEXT_PRIV(m);
+ int from, to;
+
+ from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+ from += tm->from_offset;
+
+ to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+ to += tm->to_offset;
+
+ return skb_find_text(skb, from, to, tm->config) != UINT_MAX;
+}
+
+static int em_text_change(struct net *net, void *data, int len,
+ struct tcf_ematch *m)
+{
+ struct text_match *tm;
+ struct tcf_em_text *conf = data;
+ struct ts_config *ts_conf;
+ int flags = 0;
+
+ if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
+ return -EINVAL;
+
+ if (conf->from_layer > conf->to_layer)
+ return -EINVAL;
+
+ if (conf->from_layer == conf->to_layer &&
+ conf->from_offset > conf->to_offset)
+ return -EINVAL;
+
+retry:
+ ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
+ conf->pattern_len, GFP_KERNEL, flags);
+
+ if (flags & TS_AUTOLOAD)
+ rtnl_lock();
+
+ if (IS_ERR(ts_conf)) {
+ if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
+ rtnl_unlock();
+ flags |= TS_AUTOLOAD;
+ goto retry;
+ } else
+ return PTR_ERR(ts_conf);
+ } else if (flags & TS_AUTOLOAD) {
+ textsearch_destroy(ts_conf);
+ return -EAGAIN;
+ }
+
+ tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+ if (tm == NULL) {
+ textsearch_destroy(ts_conf);
+ return -ENOBUFS;
+ }
+
+ tm->from_offset = conf->from_offset;
+ tm->to_offset = conf->to_offset;
+ tm->from_layer = conf->from_layer;
+ tm->to_layer = conf->to_layer;
+ tm->config = ts_conf;
+
+ m->datalen = sizeof(*tm);
+ m->data = (unsigned long) tm;
+
+ return 0;
+}
+
+static void em_text_destroy(struct tcf_ematch *m)
+{
+ if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config)
+ textsearch_destroy(EM_TEXT_PRIV(m)->config);
+}
+
+static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct text_match *tm = EM_TEXT_PRIV(m);
+ struct tcf_em_text conf;
+
+ strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+ conf.from_offset = tm->from_offset;
+ conf.to_offset = tm->to_offset;
+ conf.from_layer = tm->from_layer;
+ conf.to_layer = tm->to_layer;
+ conf.pattern_len = textsearch_get_pattern_len(tm->config);
+ conf.pad = 0;
+
+ if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0)
+ goto nla_put_failure;
+ if (nla_append(skb, conf.pattern_len,
+ textsearch_get_pattern(tm->config)) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct tcf_ematch_ops em_text_ops = {
+ .kind = TCF_EM_TEXT,
+ .change = em_text_change,
+ .match = em_text_match,
+ .destroy = em_text_destroy,
+ .dump = em_text_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_text_ops.link)
+};
+
+static int __init init_em_text(void)
+{
+ return tcf_em_register(&em_text_ops);
+}
+
+static void __exit exit_em_text(void)
+{
+ tcf_em_unregister(&em_text_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_text);
+module_exit(exit_em_text);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT);
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
new file mode 100644
index 000000000..797bdb88c
--- /dev/null
+++ b/net/sched/em_u32.c
@@ -0,0 +1,64 @@
+/*
+ * net/sched/em_u32.c U32 Ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Based on net/sched/cls_u32.c
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct tc_u32_key *key = (struct tc_u32_key *) em->data;
+ const unsigned char *ptr = skb_network_header(skb);
+
+ if (info) {
+ if (info->ptr)
+ ptr = info->ptr;
+ ptr += (info->nexthdr & key->offmask);
+ }
+
+ ptr += key->off;
+
+ if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
+ return 0;
+
+ return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
+}
+
+static struct tcf_ematch_ops em_u32_ops = {
+ .kind = TCF_EM_U32,
+ .datalen = sizeof(struct tc_u32_key),
+ .match = em_u32_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_u32_ops.link)
+};
+
+static int __init init_em_u32(void)
+{
+ return tcf_em_register(&em_u32_ops);
+}
+
+static void __exit exit_em_u32(void)
+{
+ tcf_em_unregister(&em_u32_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_u32);
+module_exit(exit_em_u32);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32);
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
new file mode 100644
index 000000000..113a133ee
--- /dev/null
+++ b/net/sched/ematch.c
@@ -0,0 +1,552 @@
+/*
+ * net/sched/ematch.c Extended Match API
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ * An extended match (ematch) is a small classification tool not worth
+ * writing a full classifier for. Ematches can be interconnected to form
+ * a logic expression and get attached to classifiers to extend their
+ * functionatlity.
+ *
+ * The userspace part transforms the logic expressions into an array
+ * consisting of multiple sequences of interconnected ematches separated
+ * by markers. Precedence is implemented by a special ematch kind
+ * referencing a sequence beyond the marker of the current sequence
+ * causing the current position in the sequence to be pushed onto a stack
+ * to allow the current position to be overwritten by the position referenced
+ * in the special ematch. Matching continues in the new sequence until a
+ * marker is reached causing the position to be restored from the stack.
+ *
+ * Example:
+ * A AND (B1 OR B2) AND C AND D
+ *
+ * ------->-PUSH-------
+ * -->-- / -->-- \ -->--
+ * / \ / / \ \ / \
+ * +-------+-------+-------+-------+-------+--------+
+ * | A AND | B AND | C AND | D END | B1 OR | B2 END |
+ * +-------+-------+-------+-------+-------+--------+
+ * \ /
+ * --------<-POP---------
+ *
+ * where B is a virtual ematch referencing to sequence starting with B1.
+ *
+ * ==========================================================================
+ *
+ * How to write an ematch in 60 seconds
+ * ------------------------------------
+ *
+ * 1) Provide a matcher function:
+ * static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
+ * struct tcf_pkt_info *info)
+ * {
+ * struct mydata *d = (struct mydata *) m->data;
+ *
+ * if (...matching goes here...)
+ * return 1;
+ * else
+ * return 0;
+ * }
+ *
+ * 2) Fill out a struct tcf_ematch_ops:
+ * static struct tcf_ematch_ops my_ops = {
+ * .kind = unique id,
+ * .datalen = sizeof(struct mydata),
+ * .match = my_match,
+ * .owner = THIS_MODULE,
+ * };
+ *
+ * 3) Register/Unregister your ematch:
+ * static int __init init_my_ematch(void)
+ * {
+ * return tcf_em_register(&my_ops);
+ * }
+ *
+ * static void __exit exit_my_ematch(void)
+ * {
+ * tcf_em_unregister(&my_ops);
+ * }
+ *
+ * module_init(init_my_ematch);
+ * module_exit(exit_my_ematch);
+ *
+ * 4) By now you should have two more seconds left, barely enough to
+ * open up a beer to watch the compilation going.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+static LIST_HEAD(ematch_ops);
+static DEFINE_RWLOCK(ematch_mod_lock);
+
+static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
+{
+ struct tcf_ematch_ops *e = NULL;
+
+ read_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link) {
+ if (kind == e->kind) {
+ if (!try_module_get(e->owner))
+ e = NULL;
+ read_unlock(&ematch_mod_lock);
+ return e;
+ }
+ }
+ read_unlock(&ematch_mod_lock);
+
+ return NULL;
+}
+
+/**
+ * tcf_em_register - register an extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * This function must be called by ematches to announce their presence.
+ * The given @ops must have kind set to a unique identifier and the
+ * callback match() must be implemented. All other callbacks are optional
+ * and a fallback implementation is used instead.
+ *
+ * Returns -EEXISTS if an ematch of the same kind has already registered.
+ */
+int tcf_em_register(struct tcf_ematch_ops *ops)
+{
+ int err = -EEXIST;
+ struct tcf_ematch_ops *e;
+
+ if (ops->match == NULL)
+ return -EINVAL;
+
+ write_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link)
+ if (ops->kind == e->kind)
+ goto errout;
+
+ list_add_tail(&ops->link, &ematch_ops);
+ err = 0;
+errout:
+ write_unlock(&ematch_mod_lock);
+ return err;
+}
+EXPORT_SYMBOL(tcf_em_register);
+
+/**
+ * tcf_em_unregister - unregster and extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * This function must be called by ematches to announce their disappearance
+ * for examples when the module gets unloaded. The @ops parameter must be
+ * the same as the one used for registration.
+ *
+ * Returns -ENOENT if no matching ematch was found.
+ */
+void tcf_em_unregister(struct tcf_ematch_ops *ops)
+{
+ write_lock(&ematch_mod_lock);
+ list_del(&ops->link);
+ write_unlock(&ematch_mod_lock);
+}
+EXPORT_SYMBOL(tcf_em_unregister);
+
+static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
+ int index)
+{
+ return &tree->matches[index];
+}
+
+
+static int tcf_em_validate(struct tcf_proto *tp,
+ struct tcf_ematch_tree_hdr *tree_hdr,
+ struct tcf_ematch *em, struct nlattr *nla, int idx)
+{
+ int err = -EINVAL;
+ struct tcf_ematch_hdr *em_hdr = nla_data(nla);
+ int data_len = nla_len(nla) - sizeof(*em_hdr);
+ void *data = (void *) em_hdr + sizeof(*em_hdr);
+ struct net *net = tp->chain->block->net;
+
+ if (!TCF_EM_REL_VALID(em_hdr->flags))
+ goto errout;
+
+ if (em_hdr->kind == TCF_EM_CONTAINER) {
+ /* Special ematch called "container", carries an index
+ * referencing an external ematch sequence.
+ */
+ u32 ref;
+
+ if (data_len < sizeof(ref))
+ goto errout;
+ ref = *(u32 *) data;
+
+ if (ref >= tree_hdr->nmatches)
+ goto errout;
+
+ /* We do not allow backward jumps to avoid loops and jumps
+ * to our own position are of course illegal.
+ */
+ if (ref <= idx)
+ goto errout;
+
+
+ em->data = ref;
+ } else {
+ /* Note: This lookup will increase the module refcnt
+ * of the ematch module referenced. In case of a failure,
+ * a destroy function is called by the underlying layer
+ * which automatically releases the reference again, therefore
+ * the module MUST not be given back under any circumstances
+ * here. Be aware, the destroy function assumes that the
+ * module is held if the ops field is non zero.
+ */
+ em->ops = tcf_em_lookup(em_hdr->kind);
+
+ if (em->ops == NULL) {
+ err = -ENOENT;
+#ifdef CONFIG_MODULES
+ __rtnl_unlock();
+ request_module("ematch-kind-%u", em_hdr->kind);
+ rtnl_lock();
+ em->ops = tcf_em_lookup(em_hdr->kind);
+ if (em->ops) {
+ /* We dropped the RTNL mutex in order to
+ * perform the module load. Tell the caller
+ * to replay the request.
+ */
+ module_put(em->ops->owner);
+ em->ops = NULL;
+ err = -EAGAIN;
+ }
+#endif
+ goto errout;
+ }
+
+ /* ematch module provides expected length of data, so we
+ * can do a basic sanity check.
+ */
+ if (em->ops->datalen && data_len < em->ops->datalen)
+ goto errout;
+
+ if (em->ops->change) {
+ err = -EINVAL;
+ if (em_hdr->flags & TCF_EM_SIMPLE)
+ goto errout;
+ err = em->ops->change(net, data, data_len, em);
+ if (err < 0)
+ goto errout;
+ } else if (data_len > 0) {
+ /* ematch module doesn't provide an own change
+ * procedure and expects us to allocate and copy
+ * the ematch data.
+ *
+ * TCF_EM_SIMPLE may be specified stating that the
+ * data only consists of a u32 integer and the module
+ * does not expected a memory reference but rather
+ * the value carried.
+ */
+ if (em_hdr->flags & TCF_EM_SIMPLE) {
+ if (data_len < sizeof(u32))
+ goto errout;
+ em->data = *(u32 *) data;
+ } else {
+ void *v = kmemdup(data, data_len, GFP_KERNEL);
+ if (v == NULL) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+ em->data = (unsigned long) v;
+ }
+ em->datalen = data_len;
+ }
+ }
+
+ em->matchid = em_hdr->matchid;
+ em->flags = em_hdr->flags;
+ em->net = net;
+
+ err = 0;
+errout:
+ return err;
+}
+
+static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = {
+ [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) },
+ [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED },
+};
+
+/**
+ * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
+ *
+ * @tp: classifier kind handle
+ * @nla: ematch tree configuration TLV
+ * @tree: destination ematch tree variable to store the resulting
+ * ematch tree.
+ *
+ * This function validates the given configuration TLV @nla and builds an
+ * ematch tree in @tree. The resulting tree must later be copied into
+ * the private classifier data using tcf_em_tree_change(). You MUST NOT
+ * provide the ematch tree variable of the private classifier data directly,
+ * the changes would not be locked properly.
+ *
+ * Returns a negative error code if the configuration TLV contains errors.
+ */
+int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
+ struct tcf_ematch_tree *tree)
+{
+ int idx, list_len, matches_len, err;
+ struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1];
+ struct nlattr *rt_match, *rt_hdr, *rt_list;
+ struct tcf_ematch_tree_hdr *tree_hdr;
+ struct tcf_ematch *em;
+
+ memset(tree, 0, sizeof(*tree));
+ if (!nla)
+ return 0;
+
+ err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ rt_hdr = tb[TCA_EMATCH_TREE_HDR];
+ rt_list = tb[TCA_EMATCH_TREE_LIST];
+
+ if (rt_hdr == NULL || rt_list == NULL)
+ goto errout;
+
+ tree_hdr = nla_data(rt_hdr);
+ memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));
+
+ rt_match = nla_data(rt_list);
+ list_len = nla_len(rt_list);
+ matches_len = tree_hdr->nmatches * sizeof(*em);
+
+ tree->matches = kzalloc(matches_len, GFP_KERNEL);
+ if (tree->matches == NULL)
+ goto errout;
+
+ /* We do not use nla_parse_nested here because the maximum
+ * number of attributes is unknown. This saves us the allocation
+ * for a tb buffer which would serve no purpose at all.
+ *
+ * The array of rt attributes is parsed in the order as they are
+ * provided, their type must be incremental from 1 to n. Even
+ * if it does not serve any real purpose, a failure of sticking
+ * to this policy will result in parsing failure.
+ */
+ for (idx = 0; nla_ok(rt_match, list_len); idx++) {
+ err = -EINVAL;
+
+ if (rt_match->nla_type != (idx + 1))
+ goto errout_abort;
+
+ if (idx >= tree_hdr->nmatches)
+ goto errout_abort;
+
+ if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr))
+ goto errout_abort;
+
+ em = tcf_em_get_match(tree, idx);
+
+ err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
+ if (err < 0)
+ goto errout_abort;
+
+ rt_match = nla_next(rt_match, &list_len);
+ }
+
+ /* Check if the number of matches provided by userspace actually
+ * complies with the array of matches. The number was used for
+ * the validation of references and a mismatch could lead to
+ * undefined references during the matching process.
+ */
+ if (idx != tree_hdr->nmatches) {
+ err = -EINVAL;
+ goto errout_abort;
+ }
+
+ err = 0;
+errout:
+ return err;
+
+errout_abort:
+ tcf_em_tree_destroy(tree);
+ return err;
+}
+EXPORT_SYMBOL(tcf_em_tree_validate);
+
+/**
+ * tcf_em_tree_destroy - destroy an ematch tree
+ *
+ * @tp: classifier kind handle
+ * @tree: ematch tree to be deleted
+ *
+ * This functions destroys an ematch tree previously created by
+ * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
+ * the ematch tree is not in use before calling this function.
+ */
+void tcf_em_tree_destroy(struct tcf_ematch_tree *tree)
+{
+ int i;
+
+ if (tree->matches == NULL)
+ return;
+
+ for (i = 0; i < tree->hdr.nmatches; i++) {
+ struct tcf_ematch *em = tcf_em_get_match(tree, i);
+
+ if (em->ops) {
+ if (em->ops->destroy)
+ em->ops->destroy(em);
+ else if (!tcf_em_is_simple(em))
+ kfree((void *) em->data);
+ module_put(em->ops->owner);
+ }
+ }
+
+ tree->hdr.nmatches = 0;
+ kfree(tree->matches);
+ tree->matches = NULL;
+}
+EXPORT_SYMBOL(tcf_em_tree_destroy);
+
+/**
+ * tcf_em_tree_dump - dump ematch tree into a rtnl message
+ *
+ * @skb: skb holding the rtnl message
+ * @t: ematch tree to be dumped
+ * @tlv: TLV type to be used to encapsulate the tree
+ *
+ * This function dumps a ematch tree into a rtnl message. It is valid to
+ * call this function while the ematch tree is in use.
+ *
+ * Returns -1 if the skb tailroom is insufficient.
+ */
+int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
+{
+ int i;
+ u8 *tail;
+ struct nlattr *top_start;
+ struct nlattr *list_start;
+
+ top_start = nla_nest_start(skb, tlv);
+ if (top_start == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
+ goto nla_put_failure;
+
+ list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
+ if (list_start == NULL)
+ goto nla_put_failure;
+
+ tail = skb_tail_pointer(skb);
+ for (i = 0; i < tree->hdr.nmatches; i++) {
+ struct nlattr *match_start = (struct nlattr *)tail;
+ struct tcf_ematch *em = tcf_em_get_match(tree, i);
+ struct tcf_ematch_hdr em_hdr = {
+ .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
+ .matchid = em->matchid,
+ .flags = em->flags
+ };
+
+ if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr))
+ goto nla_put_failure;
+
+ if (em->ops && em->ops->dump) {
+ if (em->ops->dump(skb, em) < 0)
+ goto nla_put_failure;
+ } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
+ u32 u = em->data;
+ nla_put_nohdr(skb, sizeof(u), &u);
+ } else if (em->datalen > 0)
+ nla_put_nohdr(skb, em->datalen, (void *) em->data);
+
+ tail = skb_tail_pointer(skb);
+ match_start->nla_len = tail - (u8 *)match_start;
+ }
+
+ nla_nest_end(skb, list_start);
+ nla_nest_end(skb, top_start);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL(tcf_em_tree_dump);
+
+static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ int r = em->ops->match(skb, em, info);
+
+ return tcf_em_is_inverted(em) ? !r : r;
+}
+
+/* Do not use this function directly, use tcf_em_tree_match instead */
+int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
+ struct tcf_pkt_info *info)
+{
+ int stackp = 0, match_idx = 0, res = 0;
+ struct tcf_ematch *cur_match;
+ int stack[CONFIG_NET_EMATCH_STACK];
+
+proceed:
+ while (match_idx < tree->hdr.nmatches) {
+ cur_match = tcf_em_get_match(tree, match_idx);
+
+ if (tcf_em_is_container(cur_match)) {
+ if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
+ goto stack_overflow;
+
+ stack[stackp++] = match_idx;
+ match_idx = cur_match->data;
+ goto proceed;
+ }
+
+ res = tcf_em_match(skb, cur_match, info);
+
+ if (tcf_em_early_end(cur_match, res))
+ break;
+
+ match_idx++;
+ }
+
+pop_stack:
+ if (stackp > 0) {
+ match_idx = stack[--stackp];
+ cur_match = tcf_em_get_match(tree, match_idx);
+
+ if (tcf_em_is_inverted(cur_match))
+ res = !res;
+
+ if (tcf_em_early_end(cur_match, res)) {
+ goto pop_stack;
+ } else {
+ match_idx++;
+ goto proceed;
+ }
+ }
+
+ return res;
+
+stack_overflow:
+ net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n");
+ return -1;
+}
+EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
new file mode 100644
index 000000000..424e70907
--- /dev/null
+++ b/net/sched/sch_api.c
@@ -0,0 +1,2209 @@
+/*
+ * net/sched/sch_api.c Packet scheduler API.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
+ * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/hrtimer.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+/*
+
+ Short review.
+ -------------
+
+ This file consists of two interrelated parts:
+
+ 1. queueing disciplines manager frontend.
+ 2. traffic classes manager frontend.
+
+ Generally, queueing discipline ("qdisc") is a black box,
+ which is able to enqueue packets and to dequeue them (when
+ device is ready to send something) in order and at times
+ determined by algorithm hidden in it.
+
+ qdisc's are divided to two categories:
+ - "queues", which have no internal structure visible from outside.
+ - "schedulers", which split all the packets to "traffic classes",
+ using "packet classifiers" (look at cls_api.c)
+
+ In turn, classes may have child qdiscs (as rule, queues)
+ attached to them etc. etc. etc.
+
+ The goal of the routines in this file is to translate
+ information supplied by user in the form of handles
+ to more intelligible for kernel form, to make some sanity
+ checks and part of work, which is common to all qdiscs
+ and to provide rtnetlink notifications.
+
+ All real intelligent work is done inside qdisc modules.
+
+
+
+ Every discipline has two major routines: enqueue and dequeue.
+
+ ---dequeue
+
+ dequeue usually returns a skb to send. It is allowed to return NULL,
+ but it does not mean that queue is empty, it just means that
+ discipline does not want to send anything this time.
+ Queue is really empty if q->q.qlen == 0.
+ For complicated disciplines with multiple queues q->q is not
+ real packet queue, but however q->q.qlen must be valid.
+
+ ---enqueue
+
+ enqueue returns 0, if packet was enqueued successfully.
+ If packet (this one or another one) was dropped, it returns
+ not zero error code.
+ NET_XMIT_DROP - this packet dropped
+ Expected action: do not backoff, but wait until queue will clear.
+ NET_XMIT_CN - probably this packet enqueued, but another one dropped.
+ Expected action: backoff or ignore
+
+ Auxiliary routines:
+
+ ---peek
+
+ like dequeue but without removing a packet from the queue
+
+ ---reset
+
+ returns qdisc to initial state: purge all buffers, clear all
+ timers, counters (except for statistics) etc.
+
+ ---init
+
+ initializes newly created qdisc.
+
+ ---destroy
+
+ destroys resources allocated by init and during lifetime of qdisc.
+
+ ---change
+
+ changes qdisc parameters.
+ */
+
+/* Protects list of registered TC modules. It is pure SMP lock. */
+static DEFINE_RWLOCK(qdisc_mod_lock);
+
+
+/************************************************
+ * Queueing disciplines manipulation. *
+ ************************************************/
+
+
+/* The list of all installed queueing disciplines. */
+
+static struct Qdisc_ops *qdisc_base;
+
+/* Register/unregister queueing discipline */
+
+int register_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ int rc = -EEXIST;
+
+ write_lock(&qdisc_mod_lock);
+ for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
+ if (!strcmp(qops->id, q->id))
+ goto out;
+
+ if (qops->enqueue == NULL)
+ qops->enqueue = noop_qdisc_ops.enqueue;
+ if (qops->peek == NULL) {
+ if (qops->dequeue == NULL)
+ qops->peek = noop_qdisc_ops.peek;
+ else
+ goto out_einval;
+ }
+ if (qops->dequeue == NULL)
+ qops->dequeue = noop_qdisc_ops.dequeue;
+
+ if (qops->cl_ops) {
+ const struct Qdisc_class_ops *cops = qops->cl_ops;
+
+ if (!(cops->find && cops->walk && cops->leaf))
+ goto out_einval;
+
+ if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
+ goto out_einval;
+ }
+
+ qops->next = NULL;
+ *qp = qops;
+ rc = 0;
+out:
+ write_unlock(&qdisc_mod_lock);
+ return rc;
+
+out_einval:
+ rc = -EINVAL;
+ goto out;
+}
+EXPORT_SYMBOL(register_qdisc);
+
+int unregister_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ int err = -ENOENT;
+
+ write_lock(&qdisc_mod_lock);
+ for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
+ if (q == qops)
+ break;
+ if (q) {
+ *qp = q->next;
+ q->next = NULL;
+ err = 0;
+ }
+ write_unlock(&qdisc_mod_lock);
+ return err;
+}
+EXPORT_SYMBOL(unregister_qdisc);
+
+/* Get default qdisc if not otherwise specified */
+void qdisc_get_default(char *name, size_t len)
+{
+ read_lock(&qdisc_mod_lock);
+ strlcpy(name, default_qdisc_ops->id, len);
+ read_unlock(&qdisc_mod_lock);
+}
+
+static struct Qdisc_ops *qdisc_lookup_default(const char *name)
+{
+ struct Qdisc_ops *q = NULL;
+
+ for (q = qdisc_base; q; q = q->next) {
+ if (!strcmp(name, q->id)) {
+ if (!try_module_get(q->owner))
+ q = NULL;
+ break;
+ }
+ }
+
+ return q;
+}
+
+/* Set new default qdisc to use */
+int qdisc_set_default(const char *name)
+{
+ const struct Qdisc_ops *ops;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ write_lock(&qdisc_mod_lock);
+ ops = qdisc_lookup_default(name);
+ if (!ops) {
+ /* Not found, drop lock and try to load module */
+ write_unlock(&qdisc_mod_lock);
+ request_module("sch_%s", name);
+ write_lock(&qdisc_mod_lock);
+
+ ops = qdisc_lookup_default(name);
+ }
+
+ if (ops) {
+ /* Set new default */
+ module_put(default_qdisc_ops->owner);
+ default_qdisc_ops = ops;
+ }
+ write_unlock(&qdisc_mod_lock);
+
+ return ops ? 0 : -ENOENT;
+}
+
+#ifdef CONFIG_NET_SCH_DEFAULT
+/* Set default value from kernel config */
+static int __init sch_default_qdisc(void)
+{
+ return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
+}
+late_initcall(sch_default_qdisc);
+#endif
+
+/* We know handle. Find qdisc among all qdisc's attached to device
+ * (root qdisc, all its children, children of children etc.)
+ * Note: caller either uses rtnl or rcu_read_lock()
+ */
+
+static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
+{
+ struct Qdisc *q;
+
+ if (!qdisc_dev(root))
+ return (root->handle == handle ? root : NULL);
+
+ if (!(root->flags & TCQ_F_BUILTIN) &&
+ root->handle == handle)
+ return root;
+
+ hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
+ if (q->handle == handle)
+ return q;
+ }
+ return NULL;
+}
+
+void qdisc_hash_add(struct Qdisc *q, bool invisible)
+{
+ if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
+ ASSERT_RTNL();
+ hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
+ if (invisible)
+ q->flags |= TCQ_F_INVISIBLE;
+ }
+}
+EXPORT_SYMBOL(qdisc_hash_add);
+
+void qdisc_hash_del(struct Qdisc *q)
+{
+ if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
+ ASSERT_RTNL();
+ hash_del_rcu(&q->hash);
+ }
+}
+EXPORT_SYMBOL(qdisc_hash_del);
+
+struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
+{
+ struct Qdisc *q;
+
+ if (!handle)
+ return NULL;
+ q = qdisc_match_from_root(dev->qdisc, handle);
+ if (q)
+ goto out;
+
+ if (dev_ingress_queue(dev))
+ q = qdisc_match_from_root(
+ dev_ingress_queue(dev)->qdisc_sleeping,
+ handle);
+out:
+ return q;
+}
+
+struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
+{
+ struct netdev_queue *nq;
+ struct Qdisc *q;
+
+ if (!handle)
+ return NULL;
+ q = qdisc_match_from_root(dev->qdisc, handle);
+ if (q)
+ goto out;
+
+ nq = dev_ingress_queue_rcu(dev);
+ if (nq)
+ q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
+out:
+ return q;
+}
+
+static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
+{
+ unsigned long cl;
+ struct Qdisc *leaf;
+ const struct Qdisc_class_ops *cops = p->ops->cl_ops;
+
+ if (cops == NULL)
+ return NULL;
+ cl = cops->find(p, classid);
+
+ if (cl == 0)
+ return NULL;
+ leaf = cops->leaf(p, cl);
+ return leaf;
+}
+
+/* Find queueing discipline by name */
+
+static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
+{
+ struct Qdisc_ops *q = NULL;
+
+ if (kind) {
+ read_lock(&qdisc_mod_lock);
+ for (q = qdisc_base; q; q = q->next) {
+ if (nla_strcmp(kind, q->id) == 0) {
+ if (!try_module_get(q->owner))
+ q = NULL;
+ break;
+ }
+ }
+ read_unlock(&qdisc_mod_lock);
+ }
+ return q;
+}
+
+/* The linklayer setting were not transferred from iproute2, in older
+ * versions, and the rate tables lookup systems have been dropped in
+ * the kernel. To keep backward compatible with older iproute2 tc
+ * utils, we detect the linklayer setting by detecting if the rate
+ * table were modified.
+ *
+ * For linklayer ATM table entries, the rate table will be aligned to
+ * 48 bytes, thus some table entries will contain the same value. The
+ * mpu (min packet unit) is also encoded into the old rate table, thus
+ * starting from the mpu, we find low and high table entries for
+ * mapping this cell. If these entries contain the same value, when
+ * the rate tables have been modified for linklayer ATM.
+ *
+ * This is done by rounding mpu to the nearest 48 bytes cell/entry,
+ * and then roundup to the next cell, calc the table entry one below,
+ * and compare.
+ */
+static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
+{
+ int low = roundup(r->mpu, 48);
+ int high = roundup(low+1, 48);
+ int cell_low = low >> r->cell_log;
+ int cell_high = (high >> r->cell_log) - 1;
+
+ /* rtab is too inaccurate at rates > 100Mbit/s */
+ if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
+ pr_debug("TC linklayer: Giving up ATM detection\n");
+ return TC_LINKLAYER_ETHERNET;
+ }
+
+ if ((cell_high > cell_low) && (cell_high < 256)
+ && (rtab[cell_low] == rtab[cell_high])) {
+ pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
+ cell_low, cell_high, rtab[cell_high]);
+ return TC_LINKLAYER_ATM;
+ }
+ return TC_LINKLAYER_ETHERNET;
+}
+
+static struct qdisc_rate_table *qdisc_rtab_list;
+
+struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
+ struct nlattr *tab,
+ struct netlink_ext_ack *extack)
+{
+ struct qdisc_rate_table *rtab;
+
+ if (tab == NULL || r->rate == 0 ||
+ r->cell_log == 0 || r->cell_log >= 32 ||
+ nla_len(tab) != TC_RTAB_SIZE) {
+ NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
+ return NULL;
+ }
+
+ for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
+ if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
+ !memcmp(&rtab->data, nla_data(tab), 1024)) {
+ rtab->refcnt++;
+ return rtab;
+ }
+ }
+
+ rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
+ if (rtab) {
+ rtab->rate = *r;
+ rtab->refcnt = 1;
+ memcpy(rtab->data, nla_data(tab), 1024);
+ if (r->linklayer == TC_LINKLAYER_UNAWARE)
+ r->linklayer = __detect_linklayer(r, rtab->data);
+ rtab->next = qdisc_rtab_list;
+ qdisc_rtab_list = rtab;
+ } else {
+ NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
+ }
+ return rtab;
+}
+EXPORT_SYMBOL(qdisc_get_rtab);
+
+void qdisc_put_rtab(struct qdisc_rate_table *tab)
+{
+ struct qdisc_rate_table *rtab, **rtabp;
+
+ if (!tab || --tab->refcnt)
+ return;
+
+ for (rtabp = &qdisc_rtab_list;
+ (rtab = *rtabp) != NULL;
+ rtabp = &rtab->next) {
+ if (rtab == tab) {
+ *rtabp = rtab->next;
+ kfree(rtab);
+ return;
+ }
+ }
+}
+EXPORT_SYMBOL(qdisc_put_rtab);
+
+static LIST_HEAD(qdisc_stab_list);
+
+static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
+ [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
+ [TCA_STAB_DATA] = { .type = NLA_BINARY },
+};
+
+static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_STAB_MAX + 1];
+ struct qdisc_size_table *stab;
+ struct tc_sizespec *s;
+ unsigned int tsize = 0;
+ u16 *tab = NULL;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (!tb[TCA_STAB_BASE]) {
+ NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
+ return ERR_PTR(-EINVAL);
+ }
+
+ s = nla_data(tb[TCA_STAB_BASE]);
+
+ if (s->tsize > 0) {
+ if (!tb[TCA_STAB_DATA]) {
+ NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
+ return ERR_PTR(-EINVAL);
+ }
+ tab = nla_data(tb[TCA_STAB_DATA]);
+ tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
+ }
+
+ if (tsize != s->tsize || (!tab && tsize > 0)) {
+ NL_SET_ERR_MSG(extack, "Invalid size of size table");
+ return ERR_PTR(-EINVAL);
+ }
+
+ list_for_each_entry(stab, &qdisc_stab_list, list) {
+ if (memcmp(&stab->szopts, s, sizeof(*s)))
+ continue;
+ if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
+ continue;
+ stab->refcnt++;
+ return stab;
+ }
+
+ if (s->size_log > STAB_SIZE_LOG_MAX ||
+ s->cell_log > STAB_SIZE_LOG_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
+ return ERR_PTR(-EINVAL);
+ }
+
+ stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ stab->refcnt = 1;
+ stab->szopts = *s;
+ if (tsize > 0)
+ memcpy(stab->data, tab, tsize * sizeof(u16));
+
+ list_add_tail(&stab->list, &qdisc_stab_list);
+
+ return stab;
+}
+
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
+void qdisc_put_stab(struct qdisc_size_table *tab)
+{
+ if (!tab)
+ return;
+
+ if (--tab->refcnt == 0) {
+ list_del(&tab->list);
+ call_rcu_bh(&tab->rcu, stab_kfree_rcu);
+ }
+}
+EXPORT_SYMBOL(qdisc_put_stab);
+
+static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_STAB);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+void __qdisc_calculate_pkt_len(struct sk_buff *skb,
+ const struct qdisc_size_table *stab)
+{
+ int pkt_len, slot;
+
+ pkt_len = skb->len + stab->szopts.overhead;
+ if (unlikely(!stab->szopts.tsize))
+ goto out;
+
+ slot = pkt_len + stab->szopts.cell_align;
+ if (unlikely(slot < 0))
+ slot = 0;
+
+ slot >>= stab->szopts.cell_log;
+ if (likely(slot < stab->szopts.tsize))
+ pkt_len = stab->data[slot];
+ else
+ pkt_len = stab->data[stab->szopts.tsize - 1] *
+ (slot / stab->szopts.tsize) +
+ stab->data[slot % stab->szopts.tsize];
+
+ pkt_len <<= stab->szopts.size_log;
+out:
+ if (unlikely(pkt_len < 1))
+ pkt_len = 1;
+ qdisc_skb_cb(skb)->pkt_len = pkt_len;
+}
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
+
+void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
+{
+ if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
+ pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+ txt, qdisc->ops->id, qdisc->handle >> 16);
+ qdisc->flags |= TCQ_F_WARN_NONWC;
+ }
+}
+EXPORT_SYMBOL(qdisc_warn_nonwc);
+
+static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
+{
+ struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
+ timer);
+
+ rcu_read_lock();
+ __netif_schedule(qdisc_root(wd->qdisc));
+ rcu_read_unlock();
+
+ return HRTIMER_NORESTART;
+}
+
+void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
+ clockid_t clockid)
+{
+ hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
+ wd->timer.function = qdisc_watchdog;
+ wd->qdisc = qdisc;
+}
+EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
+
+void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+{
+ qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
+}
+EXPORT_SYMBOL(qdisc_watchdog_init);
+
+void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
+{
+ if (test_bit(__QDISC_STATE_DEACTIVATED,
+ &qdisc_root_sleeping(wd->qdisc)->state))
+ return;
+
+ if (wd->last_expires == expires)
+ return;
+
+ wd->last_expires = expires;
+ hrtimer_start(&wd->timer,
+ ns_to_ktime(expires),
+ HRTIMER_MODE_ABS_PINNED);
+}
+EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
+
+void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
+{
+ hrtimer_cancel(&wd->timer);
+}
+EXPORT_SYMBOL(qdisc_watchdog_cancel);
+
+static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
+{
+ struct hlist_head *h;
+ unsigned int i;
+
+ h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
+
+ if (h != NULL) {
+ for (i = 0; i < n; i++)
+ INIT_HLIST_HEAD(&h[i]);
+ }
+ return h;
+}
+
+void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
+{
+ struct Qdisc_class_common *cl;
+ struct hlist_node *next;
+ struct hlist_head *nhash, *ohash;
+ unsigned int nsize, nmask, osize;
+ unsigned int i, h;
+
+ /* Rehash when load factor exceeds 0.75 */
+ if (clhash->hashelems * 4 <= clhash->hashsize * 3)
+ return;
+ nsize = clhash->hashsize * 2;
+ nmask = nsize - 1;
+ nhash = qdisc_class_hash_alloc(nsize);
+ if (nhash == NULL)
+ return;
+
+ ohash = clhash->hash;
+ osize = clhash->hashsize;
+
+ sch_tree_lock(sch);
+ for (i = 0; i < osize; i++) {
+ hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
+ h = qdisc_class_hash(cl->classid, nmask);
+ hlist_add_head(&cl->hnode, &nhash[h]);
+ }
+ }
+ clhash->hash = nhash;
+ clhash->hashsize = nsize;
+ clhash->hashmask = nmask;
+ sch_tree_unlock(sch);
+
+ kvfree(ohash);
+}
+EXPORT_SYMBOL(qdisc_class_hash_grow);
+
+int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
+{
+ unsigned int size = 4;
+
+ clhash->hash = qdisc_class_hash_alloc(size);
+ if (!clhash->hash)
+ return -ENOMEM;
+ clhash->hashsize = size;
+ clhash->hashmask = size - 1;
+ clhash->hashelems = 0;
+ return 0;
+}
+EXPORT_SYMBOL(qdisc_class_hash_init);
+
+void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
+{
+ kvfree(clhash->hash);
+}
+EXPORT_SYMBOL(qdisc_class_hash_destroy);
+
+void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
+ struct Qdisc_class_common *cl)
+{
+ unsigned int h;
+
+ INIT_HLIST_NODE(&cl->hnode);
+ h = qdisc_class_hash(cl->classid, clhash->hashmask);
+ hlist_add_head(&cl->hnode, &clhash->hash[h]);
+ clhash->hashelems++;
+}
+EXPORT_SYMBOL(qdisc_class_hash_insert);
+
+void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
+ struct Qdisc_class_common *cl)
+{
+ hlist_del(&cl->hnode);
+ clhash->hashelems--;
+}
+EXPORT_SYMBOL(qdisc_class_hash_remove);
+
+/* Allocate an unique handle from space managed by kernel
+ * Possible range is [8000-FFFF]:0000 (0x8000 values)
+ */
+static u32 qdisc_alloc_handle(struct net_device *dev)
+{
+ int i = 0x8000;
+ static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
+
+ do {
+ autohandle += TC_H_MAKE(0x10000U, 0);
+ if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
+ autohandle = TC_H_MAKE(0x80000000U, 0);
+ if (!qdisc_lookup(dev, autohandle))
+ return autohandle;
+ cond_resched();
+ } while (--i > 0);
+
+ return 0;
+}
+
+void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
+ unsigned int len)
+{
+ bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
+ const struct Qdisc_class_ops *cops;
+ unsigned long cl;
+ u32 parentid;
+ bool notify;
+ int drops;
+
+ if (n == 0 && len == 0)
+ return;
+ drops = max_t(int, n, 0);
+ rcu_read_lock();
+ while ((parentid = sch->parent)) {
+ if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
+ break;
+
+ if (sch->flags & TCQ_F_NOPARENT)
+ break;
+ /* Notify parent qdisc only if child qdisc becomes empty.
+ *
+ * If child was empty even before update then backlog
+ * counter is screwed and we skip notification because
+ * parent class is already passive.
+ *
+ * If the original child was offloaded then it is allowed
+ * to be seem as empty, so the parent is notified anyway.
+ */
+ notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
+ !qdisc_is_offloaded);
+ /* TODO: perform the search on a per txq basis */
+ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
+ if (sch == NULL) {
+ WARN_ON_ONCE(parentid != TC_H_ROOT);
+ break;
+ }
+ cops = sch->ops->cl_ops;
+ if (notify && cops->qlen_notify) {
+ cl = cops->find(sch, parentid);
+ cops->qlen_notify(sch, cl);
+ }
+ sch->q.qlen -= n;
+ sch->qstats.backlog -= len;
+ __qdisc_qstats_drop(sch, drops);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
+
+static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+ struct gnet_stats_queue __percpu *cpu_qstats = NULL;
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct gnet_dump d;
+ struct qdisc_size_table *stab;
+ u32 block_index;
+ __u32 qlen;
+
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = clid;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = refcount_read(&q->refcnt);
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+ if (q->ops->ingress_block_get) {
+ block_index = q->ops->ingress_block_get(q);
+ if (block_index &&
+ nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
+ goto nla_put_failure;
+ }
+ if (q->ops->egress_block_get) {
+ block_index = q->ops->egress_block_get(q);
+ if (block_index &&
+ nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
+ goto nla_put_failure;
+ }
+ if (q->ops->dump && q->ops->dump(q, skb) < 0)
+ goto nla_put_failure;
+ if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+ goto nla_put_failure;
+ qlen = qdisc_qlen_sum(q);
+
+ stab = rtnl_dereference(q->stab);
+ if (stab && qdisc_dump_stab(skb, stab) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+ NULL, &d, TCA_PAD) < 0)
+ goto nla_put_failure;
+
+ if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+ goto nla_put_failure;
+
+ if (qdisc_is_percpu_stats(q)) {
+ cpu_bstats = q->cpu_bstats;
+ cpu_qstats = q->cpu_qstats;
+ }
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
+ &d, cpu_bstats, &q->bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
+ gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
+{
+ if (q->flags & TCQ_F_BUILTIN)
+ return true;
+ if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
+ return true;
+
+ return false;
+}
+
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, u32 clid,
+ struct Qdisc *old, struct Qdisc *new)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (old && !tc_qdisc_dump_ignore(old, false)) {
+ if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
+ 0, RTM_DELQDISC) < 0)
+ goto err_out;
+ }
+ if (new && !tc_qdisc_dump_ignore(new, false)) {
+ if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ goto err_out;
+ }
+
+ if (skb->len)
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+
+err_out:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static void notify_and_destroy(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *n, u32 clid,
+ struct Qdisc *old, struct Qdisc *new)
+{
+ if (new || old)
+ qdisc_notify(net, skb, n, clid, old, new);
+
+ if (old)
+ qdisc_put(old);
+}
+
+/* Graft qdisc "new" to class "classid" of qdisc "parent" or
+ * to device "dev".
+ *
+ * When appropriate send a netlink notification using 'skb'
+ * and "n".
+ *
+ * On success, destroy old qdisc.
+ */
+
+static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
+ struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
+ struct Qdisc *new, struct Qdisc *old,
+ struct netlink_ext_ack *extack)
+{
+ struct Qdisc *q = old;
+ struct net *net = dev_net(dev);
+ int err = 0;
+
+ if (parent == NULL) {
+ unsigned int i, num_q, ingress;
+
+ ingress = 0;
+ num_q = dev->num_tx_queues;
+ if ((q && q->flags & TCQ_F_INGRESS) ||
+ (new && new->flags & TCQ_F_INGRESS)) {
+ num_q = 1;
+ ingress = 1;
+ if (!dev_ingress_queue(dev)) {
+ NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
+ return -ENOENT;
+ }
+ }
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ if (new && new->ops->attach)
+ goto skip;
+
+ for (i = 0; i < num_q; i++) {
+ struct netdev_queue *dev_queue = dev_ingress_queue(dev);
+
+ if (!ingress)
+ dev_queue = netdev_get_tx_queue(dev, i);
+
+ old = dev_graft_qdisc(dev_queue, new);
+ if (new && i > 0)
+ qdisc_refcount_inc(new);
+
+ if (!ingress)
+ qdisc_put(old);
+ }
+
+skip:
+ if (!ingress) {
+ notify_and_destroy(net, skb, n, classid,
+ dev->qdisc, new);
+ if (new && !new->ops->attach)
+ qdisc_refcount_inc(new);
+ dev->qdisc = new ? : &noop_qdisc;
+
+ if (new && new->ops->attach)
+ new->ops->attach(new);
+ } else {
+ notify_and_destroy(net, skb, n, classid, old, new);
+ }
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+ } else {
+ const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+
+ /* Only support running class lockless if parent is lockless */
+ if (new && (new->flags & TCQ_F_NOLOCK) &&
+ parent && !(parent->flags & TCQ_F_NOLOCK))
+ new->flags &= ~TCQ_F_NOLOCK;
+
+ err = -EOPNOTSUPP;
+ if (cops && cops->graft) {
+ unsigned long cl = cops->find(parent, classid);
+
+ if (cl) {
+ err = cops->graft(parent, cl, new, &old,
+ extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "Specified class not found");
+ err = -ENOENT;
+ }
+ }
+ if (!err)
+ notify_and_destroy(net, skb, n, classid, old, new);
+ }
+ return err;
+}
+
+static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ u32 block_index;
+
+ if (tca[TCA_INGRESS_BLOCK]) {
+ block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
+
+ if (!block_index) {
+ NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
+ return -EINVAL;
+ }
+ if (!sch->ops->ingress_block_set) {
+ NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
+ return -EOPNOTSUPP;
+ }
+ sch->ops->ingress_block_set(sch, block_index);
+ }
+ if (tca[TCA_EGRESS_BLOCK]) {
+ block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
+
+ if (!block_index) {
+ NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
+ return -EINVAL;
+ }
+ if (!sch->ops->egress_block_set) {
+ NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
+ return -EOPNOTSUPP;
+ }
+ sch->ops->egress_block_set(sch, block_index);
+ }
+ return 0;
+}
+
+/* lockdep annotation is needed for ingress; egress gets it only for name */
+static struct lock_class_key qdisc_tx_lock;
+static struct lock_class_key qdisc_rx_lock;
+
+/*
+ Allocate and initialize new qdisc.
+
+ Parameters are passed via opt.
+ */
+
+static struct Qdisc *qdisc_create(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ struct Qdisc *p, u32 parent, u32 handle,
+ struct nlattr **tca, int *errp,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+ struct nlattr *kind = tca[TCA_KIND];
+ struct Qdisc *sch;
+ struct Qdisc_ops *ops;
+ struct qdisc_size_table *stab;
+
+ ops = qdisc_lookup_ops(kind);
+#ifdef CONFIG_MODULES
+ if (ops == NULL && kind != NULL) {
+ char name[IFNAMSIZ];
+ if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+ /* We dropped the RTNL semaphore in order to
+ * perform the module load. So, even if we
+ * succeeded in loading the module we have to
+ * tell the caller to replay the request. We
+ * indicate this using -EAGAIN.
+ * We replay the request because the device may
+ * go away in the mean time.
+ */
+ rtnl_unlock();
+ request_module("sch_%s", name);
+ rtnl_lock();
+ ops = qdisc_lookup_ops(kind);
+ if (ops != NULL) {
+ /* We will try again qdisc_lookup_ops,
+ * so don't keep a reference.
+ */
+ module_put(ops->owner);
+ err = -EAGAIN;
+ goto err_out;
+ }
+ }
+ }
+#endif
+
+ err = -ENOENT;
+ if (!ops) {
+ NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
+ goto err_out;
+ }
+
+ sch = qdisc_alloc(dev_queue, ops, extack);
+ if (IS_ERR(sch)) {
+ err = PTR_ERR(sch);
+ goto err_out2;
+ }
+
+ sch->parent = parent;
+
+ if (handle == TC_H_INGRESS) {
+ sch->flags |= TCQ_F_INGRESS;
+ handle = TC_H_MAKE(TC_H_INGRESS, 0);
+ lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
+ } else {
+ if (handle == 0) {
+ handle = qdisc_alloc_handle(dev);
+ err = -ENOMEM;
+ if (handle == 0)
+ goto err_out3;
+ }
+ lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
+ if (!netif_is_multiqueue(dev))
+ sch->flags |= TCQ_F_ONETXQUEUE;
+ }
+
+ sch->handle = handle;
+
+ /* This exist to keep backward compatible with a userspace
+ * loophole, what allowed userspace to get IFF_NO_QUEUE
+ * facility on older kernels by setting tx_queue_len=0 (prior
+ * to qdisc init), and then forgot to reinit tx_queue_len
+ * before again attaching a qdisc.
+ */
+ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+ netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
+ }
+
+ err = qdisc_block_indexes_set(sch, tca, extack);
+ if (err)
+ goto err_out3;
+
+ if (ops->init) {
+ err = ops->init(sch, tca[TCA_OPTIONS], extack);
+ if (err != 0)
+ goto err_out5;
+ }
+
+ if (tca[TCA_STAB]) {
+ stab = qdisc_get_stab(tca[TCA_STAB], extack);
+ if (IS_ERR(stab)) {
+ err = PTR_ERR(stab);
+ goto err_out4;
+ }
+ rcu_assign_pointer(sch->stab, stab);
+ }
+ if (tca[TCA_RATE]) {
+ seqcount_t *running;
+
+ err = -EOPNOTSUPP;
+ if (sch->flags & TCQ_F_MQROOT) {
+ NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
+ goto err_out4;
+ }
+
+ if (sch->parent != TC_H_ROOT &&
+ !(sch->flags & TCQ_F_INGRESS) &&
+ (!p || !(p->flags & TCQ_F_MQROOT)))
+ running = qdisc_root_sleeping_running(sch);
+ else
+ running = &sch->running;
+
+ err = gen_new_estimator(&sch->bstats,
+ sch->cpu_bstats,
+ &sch->rate_est,
+ NULL,
+ running,
+ tca[TCA_RATE]);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
+ goto err_out4;
+ }
+ }
+
+ qdisc_hash_add(sch, false);
+
+ return sch;
+
+err_out5:
+ /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
+ if (ops->destroy)
+ ops->destroy(sch);
+err_out3:
+ dev_put(dev);
+ qdisc_free(sch);
+err_out2:
+ module_put(ops->owner);
+err_out:
+ *errp = err;
+ return NULL;
+
+err_out4:
+ /*
+ * Any broken qdiscs that would require a ops->reset() here?
+ * The qdisc was never in action so it shouldn't be necessary.
+ */
+ qdisc_put_stab(rtnl_dereference(sch->stab));
+ if (ops->destroy)
+ ops->destroy(sch);
+ goto err_out3;
+}
+
+static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ struct qdisc_size_table *ostab, *stab = NULL;
+ int err = 0;
+
+ if (tca[TCA_OPTIONS]) {
+ if (!sch->ops->change) {
+ NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
+ return -EINVAL;
+ }
+ if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
+ NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
+ return -EOPNOTSUPP;
+ }
+ err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
+ if (err)
+ return err;
+ }
+
+ if (tca[TCA_STAB]) {
+ stab = qdisc_get_stab(tca[TCA_STAB], extack);
+ if (IS_ERR(stab))
+ return PTR_ERR(stab);
+ }
+
+ ostab = rtnl_dereference(sch->stab);
+ rcu_assign_pointer(sch->stab, stab);
+ qdisc_put_stab(ostab);
+
+ if (tca[TCA_RATE]) {
+ /* NB: ignores errors from replace_estimator
+ because change can't be undone. */
+ if (sch->flags & TCQ_F_MQROOT)
+ goto out;
+ gen_replace_estimator(&sch->bstats,
+ sch->cpu_bstats,
+ &sch->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ }
+out:
+ return 0;
+}
+
+struct check_loop_arg {
+ struct qdisc_walker w;
+ struct Qdisc *p;
+ int depth;
+};
+
+static int check_loop_fn(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *w);
+
+static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
+{
+ struct check_loop_arg arg;
+
+ if (q->ops->cl_ops == NULL)
+ return 0;
+
+ arg.w.stop = arg.w.skip = arg.w.count = 0;
+ arg.w.fn = check_loop_fn;
+ arg.depth = depth;
+ arg.p = p;
+ q->ops->cl_ops->walk(q, &arg.w);
+ return arg.w.stop ? -ELOOP : 0;
+}
+
+static int
+check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
+{
+ struct Qdisc *leaf;
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct check_loop_arg *arg = (struct check_loop_arg *)w;
+
+ leaf = cops->leaf(q, cl);
+ if (leaf) {
+ if (leaf == arg->p || arg->depth > 7)
+ return -ELOOP;
+ return check_loop(leaf, arg->p, arg->depth + 1);
+ }
+ return 0;
+}
+
+const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
+ [TCA_KIND] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1 },
+ [TCA_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct tc_estimator) },
+ [TCA_STAB] = { .type = NLA_NESTED },
+ [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG },
+ [TCA_CHAIN] = { .type = NLA_U32 },
+ [TCA_INGRESS_BLOCK] = { .type = NLA_U32 },
+ [TCA_EGRESS_BLOCK] = { .type = NLA_U32 },
+};
+
+/*
+ * Delete/get qdisc.
+ */
+
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm = nlmsg_data(n);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ u32 clid;
+ struct Qdisc *q = NULL;
+ struct Qdisc *p = NULL;
+ int err;
+
+ if ((n->nlmsg_type != RTM_GETQDISC) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ clid = tcm->tcm_parent;
+ if (clid) {
+ if (clid != TC_H_ROOT) {
+ if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p) {
+ NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
+ return -ENOENT;
+ }
+ q = qdisc_leaf(p, clid);
+ } else if (dev_ingress_queue(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
+ }
+ } else {
+ q = dev->qdisc;
+ }
+ if (!q) {
+ NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
+ return -ENOENT;
+ }
+
+ if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
+ NL_SET_ERR_MSG(extack, "Invalid handle");
+ return -EINVAL;
+ }
+ } else {
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q) {
+ NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
+ return -ENOENT;
+ }
+ }
+
+ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ return -EINVAL;
+ }
+
+ if (n->nlmsg_type == RTM_DELQDISC) {
+ if (!clid) {
+ NL_SET_ERR_MSG(extack, "Classid cannot be zero");
+ return -EINVAL;
+ }
+ if (q->handle == 0) {
+ NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
+ return -ENOENT;
+ }
+ err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
+ if (err != 0)
+ return err;
+ } else {
+ qdisc_notify(net, skb, n, clid, NULL, q);
+ }
+ return 0;
+}
+
+/*
+ * Create/change qdisc.
+ */
+
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm;
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ u32 clid;
+ struct Qdisc *q, *p;
+ int err;
+
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+replay:
+ /* Reinit, just in case something touches this. */
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ tcm = nlmsg_data(n);
+ clid = tcm->tcm_parent;
+ q = p = NULL;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+
+ if (clid) {
+ if (clid != TC_H_ROOT) {
+ if (clid != TC_H_INGRESS) {
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p) {
+ NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
+ return -ENOENT;
+ }
+ q = qdisc_leaf(p, clid);
+ } else if (dev_ingress_queue_create(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
+ }
+ } else {
+ q = dev->qdisc;
+ }
+
+ /* It may be default qdisc, ignore it */
+ if (q && q->handle == 0)
+ q = NULL;
+
+ if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
+ if (tcm->tcm_handle) {
+ if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
+ return -EEXIST;
+ }
+ if (TC_H_MIN(tcm->tcm_handle)) {
+ NL_SET_ERR_MSG(extack, "Invalid minor handle");
+ return -EINVAL;
+ }
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
+ goto create_n_graft;
+ if (n->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
+ return -EEXIST;
+ }
+ if (tca[TCA_KIND] &&
+ nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ return -EINVAL;
+ }
+ if (q == p ||
+ (p && check_loop(q, p, 0))) {
+ NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
+ return -ELOOP;
+ }
+ qdisc_refcount_inc(q);
+ goto graft;
+ } else {
+ if (!q)
+ goto create_n_graft;
+
+ /* This magic test requires explanation.
+ *
+ * We know, that some child q is already
+ * attached to this parent and have choice:
+ * either to change it or to create/graft new one.
+ *
+ * 1. We are allowed to create/graft only
+ * if CREATE and REPLACE flags are set.
+ *
+ * 2. If EXCL is set, requestor wanted to say,
+ * that qdisc tcm_handle is not expected
+ * to exist, so that we choose create/graft too.
+ *
+ * 3. The last case is when no flags are set.
+ * Alas, it is sort of hole in API, we
+ * cannot decide what to do unambiguously.
+ * For now we select create/graft, if
+ * user gave KIND, which does not match existing.
+ */
+ if ((n->nlmsg_flags & NLM_F_CREATE) &&
+ (n->nlmsg_flags & NLM_F_REPLACE) &&
+ ((n->nlmsg_flags & NLM_F_EXCL) ||
+ (tca[TCA_KIND] &&
+ nla_strcmp(tca[TCA_KIND], q->ops->id))))
+ goto create_n_graft;
+ }
+ }
+ } else {
+ if (!tcm->tcm_handle) {
+ NL_SET_ERR_MSG(extack, "Handle cannot be zero");
+ return -EINVAL;
+ }
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ }
+
+ /* Change qdisc parameters */
+ if (!q) {
+ NL_SET_ERR_MSG(extack, "Specified qdisc not found");
+ return -ENOENT;
+ }
+ if (n->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
+ return -EEXIST;
+ }
+ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ return -EINVAL;
+ }
+ err = qdisc_change(q, tca, extack);
+ if (err == 0)
+ qdisc_notify(net, skb, n, clid, NULL, q);
+ return err;
+
+create_n_graft:
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+ NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
+ return -ENOENT;
+ }
+ if (clid == TC_H_INGRESS) {
+ if (dev_ingress_queue(dev)) {
+ q = qdisc_create(dev, dev_ingress_queue(dev), p,
+ tcm->tcm_parent, tcm->tcm_parent,
+ tca, &err, extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
+ err = -ENOENT;
+ }
+ } else {
+ struct netdev_queue *dev_queue;
+
+ if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
+ dev_queue = p->ops->cl_ops->select_queue(p, tcm);
+ else if (p)
+ dev_queue = p->dev_queue;
+ else
+ dev_queue = netdev_get_tx_queue(dev, 0);
+
+ q = qdisc_create(dev, dev_queue, p,
+ tcm->tcm_parent, tcm->tcm_handle,
+ tca, &err, extack);
+ }
+ if (q == NULL) {
+ if (err == -EAGAIN)
+ goto replay;
+ return err;
+ }
+
+graft:
+ err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
+ if (err) {
+ if (q)
+ qdisc_put(q);
+ return err;
+ }
+
+ return 0;
+}
+
+static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ int *q_idx_p, int s_q_idx, bool recur,
+ bool dump_invisible)
+{
+ int ret = 0, q_idx = *q_idx_p;
+ struct Qdisc *q;
+ int b;
+
+ if (!root)
+ return 0;
+
+ q = root;
+ if (q_idx < s_q_idx) {
+ q_idx++;
+ } else {
+ if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
+ tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
+ goto done;
+ q_idx++;
+ }
+
+ /* If dumping singletons, there is no qdisc_dev(root) and the singleton
+ * itself has already been dumped.
+ *
+ * If we've already dumped the top-level (ingress) qdisc above and the global
+ * qdisc hashtable, we don't want to hit it again
+ */
+ if (!qdisc_dev(root) || !recur)
+ goto out;
+
+ hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
+ if (q_idx < s_q_idx) {
+ q_idx++;
+ continue;
+ }
+ if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
+ tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
+ goto done;
+ q_idx++;
+ }
+
+out:
+ *q_idx_p = q_idx;
+ return ret;
+done:
+ ret = -1;
+ goto out;
+}
+
+static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int idx, q_idx;
+ int s_idx, s_q_idx;
+ struct net_device *dev;
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct nlattr *tca[TCA_MAX + 1];
+ int err;
+
+ s_idx = cb->args[0];
+ s_q_idx = q_idx = cb->args[1];
+
+ idx = 0;
+ ASSERT_RTNL();
+
+ err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
+ rtm_tca_policy, NULL);
+ if (err < 0)
+ return err;
+
+ for_each_netdev(net, dev) {
+ struct netdev_queue *dev_queue;
+
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ s_q_idx = 0;
+ q_idx = 0;
+
+ if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
+ true, tca[TCA_DUMP_INVISIBLE]) < 0)
+ goto done;
+
+ dev_queue = dev_ingress_queue(dev);
+ if (dev_queue &&
+ tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+ &q_idx, s_q_idx, false,
+ tca[TCA_DUMP_INVISIBLE]) < 0)
+ goto done;
+
+cont:
+ idx++;
+ }
+
+done:
+ cb->args[0] = idx;
+ cb->args[1] = q_idx;
+
+ return skb->len;
+}
+
+
+
+/************************************************
+ * Traffic classes manipulation. *
+ ************************************************/
+
+static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
+ unsigned long cl,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct gnet_dump d;
+ const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
+
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = q->handle;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = 0;
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+ if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+ NULL, &d, TCA_PAD) < 0)
+ goto nla_put_failure;
+
+ if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct Qdisc *q,
+ unsigned long cl, int event)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static int tclass_del_notify(struct net *net,
+ const struct Qdisc_class_ops *cops,
+ struct sk_buff *oskb, struct nlmsghdr *n,
+ struct Qdisc *q, unsigned long cl)
+{
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ struct sk_buff *skb;
+ int err = 0;
+
+ if (!cops->delete)
+ return -EOPNOTSUPP;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
+ RTM_DELTCLASS) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ err = cops->delete(q, cl);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+#ifdef CONFIG_NET_CLS
+
+struct tcf_bind_args {
+ struct tcf_walker w;
+ unsigned long base;
+ unsigned long cl;
+ u32 classid;
+};
+
+static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
+{
+ struct tcf_bind_args *a = (void *)arg;
+
+ if (tp->ops->bind_class) {
+ struct Qdisc *q = tcf_block_q(tp->chain->block);
+
+ sch_tree_lock(q);
+ tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
+ sch_tree_unlock(q);
+ }
+ return 0;
+}
+
+static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
+ unsigned long new_cl)
+{
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct tcf_block *block;
+ struct tcf_chain *chain;
+ unsigned long cl;
+
+ cl = cops->find(q, portid);
+ if (!cl)
+ return;
+ if (!cops->tcf_block)
+ return;
+ block = cops->tcf_block(q, cl, NULL);
+ if (!block)
+ return;
+ list_for_each_entry(chain, &block->chain_list, list) {
+ struct tcf_proto *tp;
+
+ for (tp = rtnl_dereference(chain->filter_chain);
+ tp; tp = rtnl_dereference(tp->next)) {
+ struct tcf_bind_args arg = {};
+
+ arg.w.fn = tcf_node_bind;
+ arg.classid = clid;
+ arg.base = cl;
+ arg.cl = new_cl;
+ tp->ops->walk(tp, &arg.w);
+ }
+ }
+}
+
+#else
+
+static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
+ unsigned long new_cl)
+{
+}
+
+#endif
+
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm = nlmsg_data(n);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ struct Qdisc *q = NULL;
+ const struct Qdisc_class_ops *cops;
+ unsigned long cl = 0;
+ unsigned long new_cl;
+ u32 portid;
+ u32 clid;
+ u32 qid;
+ int err;
+
+ if ((n->nlmsg_type != RTM_GETTCLASS) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ /*
+ parent == TC_H_UNSPEC - unspecified parent.
+ parent == TC_H_ROOT - class is root, which has no parent.
+ parent == X:0 - parent is root class.
+ parent == X:Y - parent is a node in hierarchy.
+ parent == 0:Y - parent is X:Y, where X:0 is qdisc.
+
+ handle == 0:0 - generate handle from kernel pool.
+ handle == 0:Y - class is X:Y, where X:0 is qdisc.
+ handle == X:Y - clear.
+ handle == X:0 - root class.
+ */
+
+ /* Step 1. Determine qdisc handle X:0 */
+
+ portid = tcm->tcm_parent;
+ clid = tcm->tcm_handle;
+ qid = TC_H_MAJ(clid);
+
+ if (portid != TC_H_ROOT) {
+ u32 qid1 = TC_H_MAJ(portid);
+
+ if (qid && qid1) {
+ /* If both majors are known, they must be identical. */
+ if (qid != qid1)
+ return -EINVAL;
+ } else if (qid1) {
+ qid = qid1;
+ } else if (qid == 0)
+ qid = dev->qdisc->handle;
+
+ /* Now qid is genuine qdisc handle consistent
+ * both with parent and child.
+ *
+ * TC_H_MAJ(portid) still may be unspecified, complete it now.
+ */
+ if (portid)
+ portid = TC_H_MAKE(qid, portid);
+ } else {
+ if (qid == 0)
+ qid = dev->qdisc->handle;
+ }
+
+ /* OK. Locate qdisc */
+ q = qdisc_lookup(dev, qid);
+ if (!q)
+ return -ENOENT;
+
+ /* An check that it supports classes */
+ cops = q->ops->cl_ops;
+ if (cops == NULL)
+ return -EINVAL;
+
+ /* Now try to get class */
+ if (clid == 0) {
+ if (portid == TC_H_ROOT)
+ clid = qid;
+ } else
+ clid = TC_H_MAKE(qid, clid);
+
+ if (clid)
+ cl = cops->find(q, clid);
+
+ if (cl == 0) {
+ err = -ENOENT;
+ if (n->nlmsg_type != RTM_NEWTCLASS ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
+ goto out;
+ } else {
+ switch (n->nlmsg_type) {
+ case RTM_NEWTCLASS:
+ err = -EEXIST;
+ if (n->nlmsg_flags & NLM_F_EXCL)
+ goto out;
+ break;
+ case RTM_DELTCLASS:
+ err = tclass_del_notify(net, cops, skb, n, q, cl);
+ /* Unbind the class with flilters with 0 */
+ tc_bind_tclass(q, portid, clid, 0);
+ goto out;
+ case RTM_GETTCLASS:
+ err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
+ goto out;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
+ NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
+ return -EOPNOTSUPP;
+ }
+
+ new_cl = cl;
+ err = -EOPNOTSUPP;
+ if (cops->change)
+ err = cops->change(q, clid, portid, tca, &new_cl, extack);
+ if (err == 0) {
+ tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
+ /* We just create a new class, need to do reverse binding. */
+ if (cl != new_cl)
+ tc_bind_tclass(q, portid, clid, new_cl);
+ }
+out:
+ return err;
+}
+
+struct qdisc_dump_args {
+ struct qdisc_walker w;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+};
+
+static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *arg)
+{
+ struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
+
+ return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTCLASS);
+}
+
+static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
+ struct tcmsg *tcm, struct netlink_callback *cb,
+ int *t_p, int s_t)
+{
+ struct qdisc_dump_args arg;
+
+ if (tc_qdisc_dump_ignore(q, false) ||
+ *t_p < s_t || !q->ops->cl_ops ||
+ (tcm->tcm_parent &&
+ TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
+ (*t_p)++;
+ return 0;
+ }
+ if (*t_p > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ arg.w.fn = qdisc_class_dump;
+ arg.skb = skb;
+ arg.cb = cb;
+ arg.w.stop = 0;
+ arg.w.skip = cb->args[1];
+ arg.w.count = 0;
+ q->ops->cl_ops->walk(q, &arg.w);
+ cb->args[1] = arg.w.count;
+ if (arg.w.stop)
+ return -1;
+ (*t_p)++;
+ return 0;
+}
+
+static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
+ struct tcmsg *tcm, struct netlink_callback *cb,
+ int *t_p, int s_t, bool recur)
+{
+ struct Qdisc *q;
+ int b;
+
+ if (!root)
+ return 0;
+
+ if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
+ return -1;
+
+ if (!qdisc_dev(root) || !recur)
+ return 0;
+
+ if (tcm->tcm_parent) {
+ q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
+ if (q && q != root &&
+ tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
+ return -1;
+ return 0;
+ }
+ hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
+ if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ struct netdev_queue *dev_queue;
+ struct net_device *dev;
+ int t, s_t;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return 0;
+ dev = dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return 0;
+
+ s_t = cb->args[0];
+ t = 0;
+
+ if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
+ goto done;
+
+ dev_queue = dev_ingress_queue(dev);
+ if (dev_queue &&
+ tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+ &t, s_t, false) < 0)
+ goto done;
+
+done:
+ cb->args[0] = t;
+
+ dev_put(dev);
+ return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+static int psched_show(struct seq_file *seq, void *v)
+{
+ seq_printf(seq, "%08x %08x %08x %08x\n",
+ (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
+ 1000000,
+ (u32)NSEC_PER_SEC / hrtimer_resolution);
+
+ return 0;
+}
+
+static int __net_init psched_net_init(struct net *net)
+{
+ struct proc_dir_entry *e;
+
+ e = proc_create_single("psched", 0, net->proc_net, psched_show);
+ if (e == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{
+ remove_proc_entry("psched", net->proc_net);
+}
+#else
+static int __net_init psched_net_init(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{
+}
+#endif
+
+static struct pernet_operations psched_net_ops = {
+ .init = psched_net_init,
+ .exit = psched_net_exit,
+};
+
+static int __init pktsched_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&psched_net_ops);
+ if (err) {
+ pr_err("pktsched_init: "
+ "cannot initialize per netns operations\n");
+ return err;
+ }
+
+ register_qdisc(&pfifo_fast_ops);
+ register_qdisc(&pfifo_qdisc_ops);
+ register_qdisc(&bfifo_qdisc_ops);
+ register_qdisc(&pfifo_head_drop_qdisc_ops);
+ register_qdisc(&mq_qdisc_ops);
+ register_qdisc(&noqueue_qdisc_ops);
+
+ rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
+ 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
+ 0);
+
+ return 0;
+}
+
+subsys_initcall(pktsched_init);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
new file mode 100644
index 000000000..9a1bfa13a
--- /dev/null
+++ b/net/sched/sch_atm.c
@@ -0,0 +1,705 @@
+/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
+
+/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/atmdev.h>
+#include <linux/atmclip.h>
+#include <linux/rtnetlink.h>
+#include <linux/file.h> /* for fput */
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+/*
+ * The ATM queuing discipline provides a framework for invoking classifiers
+ * (aka "filters"), which in turn select classes of this queuing discipline.
+ * Each class maps the flow(s) it is handling to a given VC. Multiple classes
+ * may share the same VC.
+ *
+ * When creating a class, VCs are specified by passing the number of the open
+ * socket descriptor by which the calling process references the VC. The kernel
+ * keeps the VC open at least until all classes using it are removed.
+ *
+ * In this file, most functions are named atm_tc_* to avoid confusion with all
+ * the atm_* in net/atm. This naming convention differs from what's used in the
+ * rest of net/sched.
+ *
+ * Known bugs:
+ * - sometimes messes up the IP stack
+ * - any manipulations besides the few operations described in the README, are
+ * untested and likely to crash the system
+ * - should lock the flow while there is data in the queue (?)
+ */
+
+#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
+
+struct atm_flow_data {
+ struct Qdisc_class_common common;
+ struct Qdisc *q; /* FIFO, TBF, etc. */
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
+ void (*old_pop)(struct atm_vcc *vcc,
+ struct sk_buff *skb); /* chaining */
+ struct atm_qdisc_data *parent; /* parent qdisc */
+ struct socket *sock; /* for closing */
+ int ref; /* reference count */
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct list_head list;
+ struct atm_flow_data *excess; /* flow for excess traffic;
+ NULL to set CLP instead */
+ int hdr_len;
+ unsigned char hdr[0]; /* header data; MUST BE LAST */
+};
+
+struct atm_qdisc_data {
+ struct atm_flow_data link; /* unclassified skbs go here */
+ struct list_head flows; /* NB: "link" is also on this
+ list */
+ struct tasklet_struct task; /* dequeue tasklet */
+};
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ list_for_each_entry(flow, &p->flows, list) {
+ if (flow->common.classid == classid)
+ return flow;
+ }
+ return NULL;
+}
+
+static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+ pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
+ sch, p, flow, new, old);
+ if (list_empty(&flow->list))
+ return -EINVAL;
+ if (!new)
+ new = &noop_qdisc;
+ *old = flow->q;
+ flow->q = new;
+ if (*old)
+ qdisc_reset(*old);
+ return 0;
+}
+
+static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+ pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
+ return flow ? flow->q : NULL;
+}
+
+static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid)
+{
+ struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
+ flow = lookup_flow(sch, classid);
+ pr_debug("%s: flow %p\n", __func__, flow);
+ return (unsigned long)flow;
+}
+
+static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
+ flow = lookup_flow(sch, classid);
+ if (flow)
+ flow->ref++;
+ pr_debug("%s: flow %p\n", __func__, flow);
+ return (unsigned long)flow;
+}
+
+/*
+ * atm_tc_put handles all destructions, including the ones that are explicitly
+ * requested (atm_tc_destroy, etc.). The assumption here is that we never drop
+ * anything that still seems to be in use.
+ */
+static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+ pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+ if (--flow->ref)
+ return;
+ pr_debug("atm_tc_put: destroying\n");
+ list_del_init(&flow->list);
+ pr_debug("atm_tc_put: qdisc %p\n", flow->q);
+ qdisc_put(flow->q);
+ tcf_block_put(flow->block);
+ if (flow->sock) {
+ pr_debug("atm_tc_put: f_count %ld\n",
+ file_count(flow->sock->file));
+ flow->vcc->pop = flow->old_pop;
+ sockfd_put(flow->sock);
+ }
+ if (flow->excess)
+ atm_tc_put(sch, (unsigned long)flow->excess);
+ if (flow != &p->link)
+ kfree(flow);
+ /*
+ * If flow == &p->link, the qdisc no longer works at this point and
+ * needs to be removed. (By the caller of atm_tc_put.)
+ */
+}
+
+static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
+
+ pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
+ VCC2FLOW(vcc)->old_pop(vcc, skb);
+ tasklet_schedule(&p->task);
+}
+
+static const u8 llc_oui_ip[] = {
+ 0xaa, /* DSAP: non-ISO */
+ 0xaa, /* SSAP: non-ISO */
+ 0x03, /* Ctrl: Unnumbered Information Command PDU */
+ 0x00, /* OUI: EtherType */
+ 0x00, 0x00,
+ 0x08, 0x00
+}; /* Ethertype IP (0800) */
+
+static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
+ [TCA_ATM_FD] = { .type = NLA_U32 },
+ [TCA_ATM_EXCESS] = { .type = NLA_U32 },
+};
+
+static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
+ struct atm_flow_data *excess = NULL;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ATM_MAX + 1];
+ struct socket *sock;
+ int fd, error, hdr_len;
+ void *hdr;
+
+ pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
+ "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
+ /*
+ * The concept of parents doesn't apply for this qdisc.
+ */
+ if (parent && parent != TC_H_ROOT && parent != sch->handle)
+ return -EINVAL;
+ /*
+ * ATM classes cannot be changed. In order to change properties of the
+ * ATM connection, that socket needs to be modified directly (via the
+ * native ATM API. In order to send a flow to a different VC, the old
+ * class needs to be removed and a new one added. (This may be changed
+ * later.)
+ */
+ if (flow)
+ return -EBUSY;
+ if (opt == NULL)
+ return -EINVAL;
+
+ error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy, NULL);
+ if (error < 0)
+ return error;
+
+ if (!tb[TCA_ATM_FD])
+ return -EINVAL;
+ fd = nla_get_u32(tb[TCA_ATM_FD]);
+ pr_debug("atm_tc_change: fd %d\n", fd);
+ if (tb[TCA_ATM_HDR]) {
+ hdr_len = nla_len(tb[TCA_ATM_HDR]);
+ hdr = nla_data(tb[TCA_ATM_HDR]);
+ } else {
+ hdr_len = RFC1483LLC_LEN;
+ hdr = NULL; /* default LLC/SNAP for IP */
+ }
+ if (!tb[TCA_ATM_EXCESS])
+ excess = NULL;
+ else {
+ excess = (struct atm_flow_data *)
+ atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
+ if (!excess)
+ return -ENOENT;
+ }
+ pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
+ opt->nla_type, nla_len(opt), hdr_len);
+ sock = sockfd_lookup(fd, &error);
+ if (!sock)
+ return error; /* f_count++ */
+ pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
+ if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
+ error = -EPROTOTYPE;
+ goto err_out;
+ }
+ /* @@@ should check if the socket is really operational or we'll crash
+ on vcc->send */
+ if (classid) {
+ if (TC_H_MAJ(classid ^ sch->handle)) {
+ pr_debug("atm_tc_change: classid mismatch\n");
+ error = -EINVAL;
+ goto err_out;
+ }
+ } else {
+ int i;
+ unsigned long cl;
+
+ for (i = 1; i < 0x8000; i++) {
+ classid = TC_H_MAKE(sch->handle, 0x8000 | i);
+ cl = atm_tc_find(sch, classid);
+ if (!cl)
+ break;
+ }
+ }
+ pr_debug("atm_tc_change: new id %x\n", classid);
+ flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
+ pr_debug("atm_tc_change: flow %p\n", flow);
+ if (!flow) {
+ error = -ENOBUFS;
+ goto err_out;
+ }
+
+ error = tcf_block_get(&flow->block, &flow->filter_list, sch,
+ extack);
+ if (error) {
+ kfree(flow);
+ goto err_out;
+ }
+
+ flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
+ extack);
+ if (!flow->q)
+ flow->q = &noop_qdisc;
+ pr_debug("atm_tc_change: qdisc %p\n", flow->q);
+ flow->sock = sock;
+ flow->vcc = ATM_SD(sock); /* speedup */
+ flow->vcc->user_back = flow;
+ pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
+ flow->old_pop = flow->vcc->pop;
+ flow->parent = p;
+ flow->vcc->pop = sch_atm_pop;
+ flow->common.classid = classid;
+ flow->ref = 1;
+ flow->excess = excess;
+ list_add(&flow->list, &p->link.list);
+ flow->hdr_len = hdr_len;
+ if (hdr)
+ memcpy(flow->hdr, hdr, hdr_len);
+ else
+ memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
+ *arg = (unsigned long)flow;
+ return 0;
+err_out:
+ sockfd_put(sock);
+ return error;
+}
+
+static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+ pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+ if (list_empty(&flow->list))
+ return -EINVAL;
+ if (rcu_access_pointer(flow->filter_list) || flow == &p->link)
+ return -EBUSY;
+ /*
+ * Reference count must be 2: one for "keepalive" (set at class
+ * creation), and one for the reference held when calling delete.
+ */
+ if (flow->ref < 2) {
+ pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
+ return -EINVAL;
+ }
+ if (flow->ref > 2)
+ return -EBUSY; /* catch references via excess, etc. */
+ atm_tc_put(sch, arg);
+ return 0;
+}
+
+static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+ if (walker->stop)
+ return;
+ list_for_each_entry(flow, &p->flows, list) {
+ if (walker->count >= walker->skip &&
+ walker->fn(sch, (unsigned long)flow, walker) < 0) {
+ walker->stop = 1;
+ break;
+ }
+ walker->count++;
+ }
+}
+
+static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+ pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+ return flow ? flow->block : p->link.block;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+ struct tcf_result res;
+ int result;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+ result = TC_ACT_OK; /* be nice to gcc */
+ flow = NULL;
+ if (TC_H_MAJ(skb->priority) != sch->handle ||
+ !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) {
+ struct tcf_proto *fl;
+
+ list_for_each_entry(flow, &p->flows, list) {
+ fl = rcu_dereference_bh(flow->filter_list);
+ if (fl) {
+ result = tcf_classify(skb, fl, &res, true);
+ if (result < 0)
+ continue;
+ flow = (struct atm_flow_data *)res.class;
+ if (!flow)
+ flow = lookup_flow(sch, res.classid);
+ goto done;
+ }
+ }
+ flow = NULL;
+done:
+ ;
+ }
+ if (!flow) {
+ flow = &p->link;
+ } else {
+ if (flow->vcc)
+ ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
+ /*@@@ looks good ... but it's not supposed to work :-) */
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ __qdisc_drop(skb, to_free);
+ return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ __qdisc_drop(skb, to_free);
+ goto drop;
+ case TC_ACT_RECLASSIFY:
+ if (flow->excess)
+ flow = flow->excess;
+ else
+ ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
+ break;
+ }
+#endif
+ }
+
+ ret = qdisc_enqueue(skb, flow->q, to_free);
+ if (ret != NET_XMIT_SUCCESS) {
+drop: __maybe_unused
+ if (net_xmit_drop_count(ret)) {
+ qdisc_qstats_drop(sch);
+ if (flow)
+ flow->qstats.drops++;
+ }
+ return ret;
+ }
+ /*
+ * Okay, this may seem weird. We pretend we've dropped the packet if
+ * it goes via ATM. The reason for this is that the outer qdisc
+ * expects to be able to q->dequeue the packet later on if we return
+ * success at this place. Also, sch->q.qdisc needs to reflect whether
+ * there is a packet egligible for dequeuing or not. Note that the
+ * statistics of the outer qdisc are necessarily wrong because of all
+ * this. There's currently no correct solution for this.
+ */
+ if (flow == &p->link) {
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+ tasklet_schedule(&p->task);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+/*
+ * Dequeue packets and send them over ATM. Note that we quite deliberately
+ * avoid checking net_device's flow control here, simply because sch_atm
+ * uses its own channels, which have nothing to do with any CLIP/LANE/or
+ * non-ATM interfaces.
+ */
+
+static void sch_atm_dequeue(unsigned long data)
+{
+ struct Qdisc *sch = (struct Qdisc *)data;
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+ struct sk_buff *skb;
+
+ pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
+ list_for_each_entry(flow, &p->flows, list) {
+ if (flow == &p->link)
+ continue;
+ /*
+ * If traffic is properly shaped, this won't generate nasty
+ * little bursts. Otherwise, it may ... (but that's okay)
+ */
+ while ((skb = flow->q->ops->peek(flow->q))) {
+ if (!atm_may_send(flow->vcc, skb->truesize))
+ break;
+
+ skb = qdisc_dequeue_peeked(flow->q);
+ if (unlikely(!skb))
+ break;
+
+ qdisc_bstats_update(sch, skb);
+ bstats_update(&flow->bstats, skb);
+ pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
+ /* remove any LL header somebody else has attached */
+ skb_pull(skb, skb_network_offset(skb));
+ if (skb_headroom(skb) < flow->hdr_len) {
+ struct sk_buff *new;
+
+ new = skb_realloc_headroom(skb, flow->hdr_len);
+ dev_kfree_skb(skb);
+ if (!new)
+ continue;
+ skb = new;
+ }
+ pr_debug("sch_atm_dequeue: ip %p, data %p\n",
+ skb_network_header(skb), skb->data);
+ ATM_SKB(skb)->vcc = flow->vcc;
+ memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
+ flow->hdr_len);
+ refcount_add(skb->truesize,
+ &sk_atm(flow->vcc)->sk_wmem_alloc);
+ /* atm.atm_options are already set by atm_tc_enqueue */
+ flow->vcc->send(flow->vcc, skb);
+ }
+ }
+}
+
+static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
+ tasklet_schedule(&p->task);
+ skb = qdisc_dequeue_peeked(p->link.q);
+ if (skb)
+ sch->q.qlen--;
+ return skb;
+}
+
+static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
+
+ return p->link.q->ops->peek(p->link.q);
+}
+
+static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ int err;
+
+ pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+ INIT_LIST_HEAD(&p->flows);
+ INIT_LIST_HEAD(&p->link.list);
+ list_add(&p->link.list, &p->flows);
+ p->link.q = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, sch->handle, extack);
+ if (!p->link.q)
+ p->link.q = &noop_qdisc;
+ pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
+ p->link.vcc = NULL;
+ p->link.sock = NULL;
+ p->link.common.classid = sch->handle;
+ p->link.ref = 1;
+
+ err = tcf_block_get(&p->link.block, &p->link.filter_list, sch,
+ extack);
+ if (err)
+ return err;
+
+ tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
+ return 0;
+}
+
+static void atm_tc_reset(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
+ list_for_each_entry(flow, &p->flows, list)
+ qdisc_reset(flow->q);
+ sch->q.qlen = 0;
+}
+
+static void atm_tc_destroy(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow, *tmp;
+
+ pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
+ list_for_each_entry(flow, &p->flows, list) {
+ tcf_block_put(flow->block);
+ flow->block = NULL;
+ }
+
+ list_for_each_entry_safe(flow, tmp, &p->flows, list) {
+ if (flow->ref > 1)
+ pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
+ atm_tc_put(sch, (unsigned long)flow);
+ }
+ tasklet_kill(&p->task);
+}
+
+static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+ struct nlattr *nest;
+
+ pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
+ sch, p, flow, skb, tcm);
+ if (list_empty(&flow->list))
+ return -EINVAL;
+ tcm->tcm_handle = flow->common.classid;
+ tcm->tcm_info = flow->q->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
+ goto nla_put_failure;
+ if (flow->vcc) {
+ struct sockaddr_atmpvc pvc;
+ int state;
+
+ memset(&pvc, 0, sizeof(pvc));
+ pvc.sap_family = AF_ATMPVC;
+ pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
+ pvc.sap_addr.vpi = flow->vcc->vpi;
+ pvc.sap_addr.vci = flow->vcc->vci;
+ if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
+ goto nla_put_failure;
+ state = ATM_VF2VS(flow->vcc->flags);
+ if (nla_put_u32(skb, TCA_ATM_STATE, state))
+ goto nla_put_failure;
+ }
+ if (flow->excess) {
+ if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->common.classid))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
+ goto nla_put_failure;
+ }
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+static int
+atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &flow->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ return 0;
+}
+
+static const struct Qdisc_class_ops atm_class_ops = {
+ .graft = atm_tc_graft,
+ .leaf = atm_tc_leaf,
+ .find = atm_tc_find,
+ .change = atm_tc_change,
+ .delete = atm_tc_delete,
+ .walk = atm_tc_walk,
+ .tcf_block = atm_tc_tcf_block,
+ .bind_tcf = atm_tc_bind_filter,
+ .unbind_tcf = atm_tc_put,
+ .dump = atm_tc_dump_class,
+ .dump_stats = atm_tc_dump_class_stats,
+};
+
+static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
+ .cl_ops = &atm_class_ops,
+ .id = "atm",
+ .priv_size = sizeof(struct atm_qdisc_data),
+ .enqueue = atm_tc_enqueue,
+ .dequeue = atm_tc_dequeue,
+ .peek = atm_tc_peek,
+ .init = atm_tc_init,
+ .reset = atm_tc_reset,
+ .destroy = atm_tc_destroy,
+ .dump = atm_tc_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init atm_init(void)
+{
+ return register_qdisc(&atm_qdisc_ops);
+}
+
+static void __exit atm_exit(void)
+{
+ unregister_qdisc(&atm_qdisc_ops);
+}
+
+module_init(atm_init)
+module_exit(atm_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000..9c4c2bb54
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,45 @@
+/*
+ * net/sched/sch_blackhole.c Black hole queue
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * Note: Quantum tunneling is not supported.
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ qdisc_drop(skb, sch, to_free);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
+ .id = "blackhole",
+ .priv_size = 0,
+ .enqueue = blackhole_enqueue,
+ .dequeue = blackhole_dequeue,
+ .peek = blackhole_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static int __init blackhole_init(void)
+{
+ return register_qdisc(&blackhole_qdisc_ops);
+}
+device_initcall(blackhole_init)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
new file mode 100644
index 000000000..18c207b85
--- /dev/null
+++ b/net/sched/sch_cake.c
@@ -0,0 +1,3058 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* COMMON Applications Kept Enhanced (CAKE) discipline
+ *
+ * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
+ * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
+ * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
+ * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
+ * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
+ * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
+ *
+ * The CAKE Principles:
+ * (or, how to have your cake and eat it too)
+ *
+ * This is a combination of several shaping, AQM and FQ techniques into one
+ * easy-to-use package:
+ *
+ * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
+ * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
+ * eliminating the need for any sort of burst parameter (eg. token bucket
+ * depth). Burst support is limited to that necessary to overcome scheduling
+ * latency.
+ *
+ * - A Diffserv-aware priority queue, giving more priority to certain classes,
+ * up to a specified fraction of bandwidth. Above that bandwidth threshold,
+ * the priority is reduced to avoid starving other tins.
+ *
+ * - Each priority tin has a separate Flow Queue system, to isolate traffic
+ * flows from each other. This prevents a burst on one flow from increasing
+ * the delay to another. Flows are distributed to queues using a
+ * set-associative hash function.
+ *
+ * - Each queue is actively managed by Cobalt, which is a combination of the
+ * Codel and Blue AQM algorithms. This serves flows fairly, and signals
+ * congestion early via ECN (if available) and/or packet drops, to keep
+ * latency low. The codel parameters are auto-tuned based on the bandwidth
+ * setting, as is necessary at low bandwidths.
+ *
+ * The configuration parameters are kept deliberately simple for ease of use.
+ * Everything has sane defaults. Complete generality of configuration is *not*
+ * a goal.
+ *
+ * The priority queue operates according to a weighted DRR scheme, combined with
+ * a bandwidth tracker which reuses the shaper logic to detect which side of the
+ * bandwidth sharing threshold the tin is operating. This determines whether a
+ * priority-based weight (high) or a bandwidth-based weight (low) is used for
+ * that tin in the current pass.
+ *
+ * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
+ * granted us permission to leverage.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/reciprocal_div.h>
+#include <net/netlink.h>
+#include <linux/if_vlan.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tcp.h>
+#include <net/flow_dissector.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#endif
+
+#define CAKE_SET_WAYS (8)
+#define CAKE_MAX_TINS (8)
+#define CAKE_QUEUES (1024)
+#define CAKE_FLOW_MASK 63
+#define CAKE_FLOW_NAT_FLAG 64
+
+/* struct cobalt_params - contains codel and blue parameters
+ * @interval: codel initial drop rate
+ * @target: maximum persistent sojourn time & blue update rate
+ * @mtu_time: serialisation delay of maximum-size packet
+ * @p_inc: increment of blue drop probability (0.32 fxp)
+ * @p_dec: decrement of blue drop probability (0.32 fxp)
+ */
+struct cobalt_params {
+ u64 interval;
+ u64 target;
+ u64 mtu_time;
+ u32 p_inc;
+ u32 p_dec;
+};
+
+/* struct cobalt_vars - contains codel and blue variables
+ * @count: codel dropping frequency
+ * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
+ * @drop_next: time to drop next packet, or when we dropped last
+ * @blue_timer: Blue time to next drop
+ * @p_drop: BLUE drop probability (0.32 fxp)
+ * @dropping: set if in dropping state
+ * @ecn_marked: set if marked
+ */
+struct cobalt_vars {
+ u32 count;
+ u32 rec_inv_sqrt;
+ ktime_t drop_next;
+ ktime_t blue_timer;
+ u32 p_drop;
+ bool dropping;
+ bool ecn_marked;
+};
+
+enum {
+ CAKE_SET_NONE = 0,
+ CAKE_SET_SPARSE,
+ CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
+ CAKE_SET_BULK,
+ CAKE_SET_DECAYING
+};
+
+struct cake_flow {
+ /* this stuff is all needed per-flow at dequeue time */
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head flowchain;
+ s32 deficit;
+ u32 dropped;
+ struct cobalt_vars cvars;
+ u16 srchost; /* index into cake_host table */
+ u16 dsthost;
+ u8 set;
+}; /* please try to keep this structure <= 64 bytes */
+
+struct cake_host {
+ u32 srchost_tag;
+ u32 dsthost_tag;
+ u16 srchost_refcnt;
+ u16 dsthost_refcnt;
+};
+
+struct cake_heap_entry {
+ u16 t:3, b:10;
+};
+
+struct cake_tin_data {
+ struct cake_flow flows[CAKE_QUEUES];
+ u32 backlogs[CAKE_QUEUES];
+ u32 tags[CAKE_QUEUES]; /* for set association */
+ u16 overflow_idx[CAKE_QUEUES];
+ struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
+ u16 flow_quantum;
+
+ struct cobalt_params cparams;
+ u32 drop_overlimit;
+ u16 bulk_flow_count;
+ u16 sparse_flow_count;
+ u16 decaying_flow_count;
+ u16 unresponsive_flow_count;
+
+ u32 max_skblen;
+
+ struct list_head new_flows;
+ struct list_head old_flows;
+ struct list_head decaying_flows;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ ktime_t time_next_packet;
+ u64 tin_rate_ns;
+ u64 tin_rate_bps;
+ u16 tin_rate_shft;
+
+ u16 tin_quantum_prio;
+ u16 tin_quantum_band;
+ s32 tin_deficit;
+ u32 tin_backlog;
+ u32 tin_dropped;
+ u32 tin_ecn_mark;
+
+ u32 packets;
+ u64 bytes;
+
+ u32 ack_drops;
+
+ /* moving averages */
+ u64 avge_delay;
+ u64 peak_delay;
+ u64 base_delay;
+
+ /* hash function stats */
+ u32 way_directs;
+ u32 way_hits;
+ u32 way_misses;
+ u32 way_collisions;
+}; /* number of tins is small, so size of this struct doesn't matter much */
+
+struct cake_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct cake_tin_data *tins;
+
+ struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
+ u16 overflow_timeout;
+
+ u16 tin_cnt;
+ u8 tin_mode;
+ u8 flow_mode;
+ u8 ack_filter;
+ u8 atm_mode;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ u16 rate_shft;
+ ktime_t time_next_packet;
+ ktime_t failsafe_next_packet;
+ u64 rate_ns;
+ u64 rate_bps;
+ u16 rate_flags;
+ s16 rate_overhead;
+ u16 rate_mpu;
+ u64 interval;
+ u64 target;
+
+ /* resource tracking */
+ u32 buffer_used;
+ u32 buffer_max_used;
+ u32 buffer_limit;
+ u32 buffer_config_limit;
+
+ /* indices for dequeue */
+ u16 cur_tin;
+ u16 cur_flow;
+
+ struct qdisc_watchdog watchdog;
+ const u8 *tin_index;
+ const u8 *tin_order;
+
+ /* bandwidth capacity estimate */
+ ktime_t last_packet_time;
+ ktime_t avg_window_begin;
+ u64 avg_packet_interval;
+ u64 avg_window_bytes;
+ u64 avg_peak_bandwidth;
+ ktime_t last_reconfig_time;
+
+ /* packet length stats */
+ u32 avg_netoff;
+ u16 max_netlen;
+ u16 max_adjlen;
+ u16 min_netlen;
+ u16 min_adjlen;
+};
+
+enum {
+ CAKE_FLAG_OVERHEAD = BIT(0),
+ CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
+ CAKE_FLAG_INGRESS = BIT(2),
+ CAKE_FLAG_WASH = BIT(3),
+ CAKE_FLAG_SPLIT_GSO = BIT(4)
+};
+
+/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
+ * obtain the best features of each. Codel is excellent on flows which
+ * respond to congestion signals in a TCP-like way. BLUE is more effective on
+ * unresponsive flows.
+ */
+
+struct cobalt_skb_cb {
+ ktime_t enqueue_time;
+ u32 adjusted_len;
+};
+
+static u64 us_to_ns(u64 us)
+{
+ return us * NSEC_PER_USEC;
+}
+
+static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
+ return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
+{
+ return get_cobalt_cb(skb)->enqueue_time;
+}
+
+static void cobalt_set_enqueue_time(struct sk_buff *skb,
+ ktime_t now)
+{
+ get_cobalt_cb(skb)->enqueue_time = now;
+}
+
+static u16 quantum_div[CAKE_QUEUES + 1] = {0};
+
+/* Diffserv lookup tables */
+
+static const u8 precedence[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+static const u8 diffserv8[] = {
+ 2, 5, 1, 2, 4, 2, 2, 2,
+ 0, 2, 1, 2, 1, 2, 1, 2,
+ 5, 2, 4, 2, 4, 2, 4, 2,
+ 3, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 2, 2, 6, 2, 6, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const u8 diffserv4[] = {
+ 0, 2, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 0, 0, 3, 0, 3, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 diffserv3[] = {
+ 0, 0, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 0, 2, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 besteffort[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* tin priority order for stats dumping */
+
+static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
+static const u8 bulk_order[] = {1, 0, 2, 3};
+
+#define REC_INV_SQRT_CACHE (16)
+static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
+
+/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+
+static void cobalt_newton_step(struct cobalt_vars *vars)
+{
+ u32 invsqrt, invsqrt2;
+ u64 val;
+
+ invsqrt = vars->rec_inv_sqrt;
+ invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
+ val = (3LL << 32) - ((u64)vars->count * invsqrt2);
+
+ val >>= 2; /* avoid overflow in following multiply */
+ val = (val * invsqrt) >> (32 - 2 + 1);
+
+ vars->rec_inv_sqrt = val;
+}
+
+static void cobalt_invsqrt(struct cobalt_vars *vars)
+{
+ if (vars->count < REC_INV_SQRT_CACHE)
+ vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
+ else
+ cobalt_newton_step(vars);
+}
+
+/* There is a big difference in timing between the accurate values placed in
+ * the cache and the approximations given by a single Newton step for small
+ * count values, particularly when stepping from count 1 to 2 or vice versa.
+ * Above 16, a single Newton step gives sufficient accuracy in either
+ * direction, given the precision stored.
+ *
+ * The magnitude of the error when stepping up to count 2 is such as to give
+ * the value that *should* have been produced at count 4.
+ */
+
+static void cobalt_cache_init(void)
+{
+ struct cobalt_vars v;
+
+ memset(&v, 0, sizeof(v));
+ v.rec_inv_sqrt = ~0U;
+ cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
+
+ for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
+ cobalt_newton_step(&v);
+ cobalt_newton_step(&v);
+ cobalt_newton_step(&v);
+ cobalt_newton_step(&v);
+
+ cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
+ }
+}
+
+static void cobalt_vars_init(struct cobalt_vars *vars)
+{
+ memset(vars, 0, sizeof(*vars));
+
+ if (!cobalt_rec_inv_sqrt_cache[0]) {
+ cobalt_cache_init();
+ cobalt_rec_inv_sqrt_cache[0] = ~0;
+ }
+}
+
+/* CoDel control_law is t + interval/sqrt(count)
+ * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
+ * both sqrt() and divide operation.
+ */
+static ktime_t cobalt_control(ktime_t t,
+ u64 interval,
+ u32 rec_inv_sqrt)
+{
+ return ktime_add_ns(t, reciprocal_scale(interval,
+ rec_inv_sqrt));
+}
+
+/* Call this when a packet had to be dropped due to queue overflow. Returns
+ * true if the BLUE state was quiescent before but active after this call.
+ */
+static bool cobalt_queue_full(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now)
+{
+ bool up = false;
+
+ if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
+ up = !vars->p_drop;
+ vars->p_drop += p->p_inc;
+ if (vars->p_drop < p->p_inc)
+ vars->p_drop = ~0;
+ vars->blue_timer = now;
+ }
+ vars->dropping = true;
+ vars->drop_next = now;
+ if (!vars->count)
+ vars->count = 1;
+
+ return up;
+}
+
+/* Call this when the queue was serviced but turned out to be empty. Returns
+ * true if the BLUE state was active before but quiescent after this call.
+ */
+static bool cobalt_queue_empty(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now)
+{
+ bool down = false;
+
+ if (vars->p_drop &&
+ ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
+ if (vars->p_drop < p->p_dec)
+ vars->p_drop = 0;
+ else
+ vars->p_drop -= p->p_dec;
+ vars->blue_timer = now;
+ down = !vars->p_drop;
+ }
+ vars->dropping = false;
+
+ if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ }
+
+ return down;
+}
+
+/* Call this with a freshly dequeued packet for possible congestion marking.
+ * Returns true as an instruction to drop the packet, false for delivery.
+ */
+static bool cobalt_should_drop(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now,
+ struct sk_buff *skb,
+ u32 bulk_flows)
+{
+ bool next_due, over_target, drop = false;
+ ktime_t schedule;
+ u64 sojourn;
+
+/* The 'schedule' variable records, in its sign, whether 'now' is before or
+ * after 'drop_next'. This allows 'drop_next' to be updated before the next
+ * scheduling decision is actually branched, without destroying that
+ * information. Similarly, the first 'schedule' value calculated is preserved
+ * in the boolean 'next_due'.
+ *
+ * As for 'drop_next', we take advantage of the fact that 'interval' is both
+ * the delay between first exceeding 'target' and the first signalling event,
+ * *and* the scaling factor for the signalling frequency. It's therefore very
+ * natural to use a single mechanism for both purposes, and eliminates a
+ * significant amount of reference Codel's spaghetti code. To help with this,
+ * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
+ * as possible to 1.0 in fixed-point.
+ */
+
+ sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+ schedule = ktime_sub(now, vars->drop_next);
+ over_target = sojourn > p->target &&
+ sojourn > p->mtu_time * bulk_flows * 2 &&
+ sojourn > p->mtu_time * 4;
+ next_due = vars->count && ktime_to_ns(schedule) >= 0;
+
+ vars->ecn_marked = false;
+
+ if (over_target) {
+ if (!vars->dropping) {
+ vars->dropping = true;
+ vars->drop_next = cobalt_control(now,
+ p->interval,
+ vars->rec_inv_sqrt);
+ }
+ if (!vars->count)
+ vars->count = 1;
+ } else if (vars->dropping) {
+ vars->dropping = false;
+ }
+
+ if (next_due && vars->dropping) {
+ /* Use ECN mark if possible, otherwise drop */
+ drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+
+ vars->count++;
+ if (!vars->count)
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = ktime_sub(now, vars->drop_next);
+ } else {
+ while (next_due) {
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = ktime_sub(now, vars->drop_next);
+ next_due = vars->count && ktime_to_ns(schedule) >= 0;
+ }
+ }
+
+ /* Simple BLUE implementation. Lack of ECN is deliberate. */
+ if (vars->p_drop)
+ drop |= (prandom_u32() < vars->p_drop);
+
+ /* Overload the drop_next field as an activity timeout */
+ if (!vars->count)
+ vars->drop_next = ktime_add_ns(now, p->interval);
+ else if (ktime_to_ns(schedule) > 0 && !drop)
+ vars->drop_next = now;
+
+ return drop;
+}
+
+static void cake_update_flowkeys(struct flow_keys *keys,
+ const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct nf_conntrack_tuple tuple = {};
+ bool rev = !skb->_nfct;
+
+ if (skb_protocol(skb, true) != htons(ETH_P_IP))
+ return;
+
+ if (!nf_ct_get_tuple_skb(&tuple, skb))
+ return;
+
+ keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
+ keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
+
+ if (keys->ports.ports) {
+ keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
+ keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
+ }
+#endif
+}
+
+/* Cake has several subtle multiple bit settings. In these cases you
+ * would be matching triple isolate mode as well.
+ */
+
+static bool cake_dsrc(int flow_mode)
+{
+ return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
+}
+
+static bool cake_ddst(int flow_mode)
+{
+ return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
+}
+
+static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
+ int flow_mode, u16 flow_override, u16 host_override)
+{
+ u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
+ u16 reduced_hash, srchost_idx, dsthost_idx;
+ struct flow_keys keys, host_keys;
+
+ if (unlikely(flow_mode == CAKE_FLOW_NONE))
+ return 0;
+
+ /* If both overrides are set we can skip packet dissection entirely */
+ if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
+ (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
+ goto skip_hash;
+
+ skb_flow_dissect_flow_keys(skb, &keys,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ if (flow_mode & CAKE_FLOW_NAT_FLAG)
+ cake_update_flowkeys(&keys, skb);
+
+ /* flow_hash_from_keys() sorts the addresses by value, so we have
+ * to preserve their order in a separate data structure to treat
+ * src and dst host addresses as independently selectable.
+ */
+ host_keys = keys;
+ host_keys.ports.ports = 0;
+ host_keys.basic.ip_proto = 0;
+ host_keys.keyid.keyid = 0;
+ host_keys.tags.flow_label = 0;
+
+ switch (host_keys.control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ host_keys.addrs.v4addrs.src = 0;
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ host_keys.addrs.v4addrs.dst = 0;
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ memset(&host_keys.addrs.v6addrs.src, 0,
+ sizeof(host_keys.addrs.v6addrs.src));
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ memset(&host_keys.addrs.v6addrs.dst, 0,
+ sizeof(host_keys.addrs.v6addrs.dst));
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ default:
+ dsthost_hash = 0;
+ srchost_hash = 0;
+ }
+
+ /* This *must* be after the above switch, since as a
+ * side-effect it sorts the src and dst addresses.
+ */
+ if (flow_mode & CAKE_FLOW_FLOWS)
+ flow_hash = flow_hash_from_keys(&keys);
+
+skip_hash:
+ if (flow_override)
+ flow_hash = flow_override - 1;
+ if (host_override) {
+ dsthost_hash = host_override - 1;
+ srchost_hash = host_override - 1;
+ }
+
+ if (!(flow_mode & CAKE_FLOW_FLOWS)) {
+ if (flow_mode & CAKE_FLOW_SRC_IP)
+ flow_hash ^= srchost_hash;
+
+ if (flow_mode & CAKE_FLOW_DST_IP)
+ flow_hash ^= dsthost_hash;
+ }
+
+ reduced_hash = flow_hash % CAKE_QUEUES;
+
+ /* set-associative hashing */
+ /* fast path if no hash collision (direct lookup succeeds) */
+ if (likely(q->tags[reduced_hash] == flow_hash &&
+ q->flows[reduced_hash].set)) {
+ q->way_directs++;
+ } else {
+ u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
+ u32 outer_hash = reduced_hash - inner_hash;
+ bool allocate_src = false;
+ bool allocate_dst = false;
+ u32 i, k;
+
+ /* check if any active queue in the set is reserved for
+ * this flow.
+ */
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->tags[outer_hash + k] == flow_hash) {
+ if (i)
+ q->way_hits++;
+
+ if (!q->flows[outer_hash + k].set) {
+ /* need to increment host refcnts */
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ }
+
+ goto found;
+ }
+ }
+
+ /* no queue is reserved for this flow, look for an
+ * empty one.
+ */
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->flows[outer_hash + k].set) {
+ q->way_misses++;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ goto found;
+ }
+ }
+
+ /* With no empty queues, default to the original
+ * queue, accept the collision, update the host tags.
+ */
+ q->way_collisions++;
+ q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
+ q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+found:
+ /* reserve queue for future packets in same flow */
+ reduced_hash = outer_hash + k;
+ q->tags[reduced_hash] = flow_hash;
+
+ if (allocate_src) {
+ srchost_idx = srchost_hash % CAKE_QUEUES;
+ inner_hash = srchost_idx % CAKE_SET_WAYS;
+ outer_hash = srchost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].srchost_tag ==
+ srchost_hash)
+ goto found_src;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].srchost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].srchost_tag = srchost_hash;
+found_src:
+ srchost_idx = outer_hash + k;
+ q->hosts[srchost_idx].srchost_refcnt++;
+ q->flows[reduced_hash].srchost = srchost_idx;
+ }
+
+ if (allocate_dst) {
+ dsthost_idx = dsthost_hash % CAKE_QUEUES;
+ inner_hash = dsthost_idx % CAKE_SET_WAYS;
+ outer_hash = dsthost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].dsthost_tag ==
+ dsthost_hash)
+ goto found_dst;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].dsthost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
+found_dst:
+ dsthost_idx = outer_hash + k;
+ q->hosts[dsthost_idx].dsthost_refcnt++;
+ q->flows[reduced_hash].dsthost = dsthost_idx;
+ }
+ }
+
+ return reduced_hash;
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+/* remove one skb from head of slot queue */
+
+static struct sk_buff *dequeue_head(struct cake_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ if (skb) {
+ flow->head = skb->next;
+ skb->next = NULL;
+ }
+
+ return skb;
+}
+
+/* add skb to flow queue (tail add) */
+
+static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
+{
+ if (!flow->head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
+ struct ipv6hdr *buf)
+{
+ unsigned int offset = skb_network_offset(skb);
+ struct iphdr *iph;
+
+ iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
+
+ if (!iph)
+ return NULL;
+
+ if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
+ return skb_header_pointer(skb, offset + iph->ihl * 4,
+ sizeof(struct ipv6hdr), buf);
+
+ else if (iph->version == 4)
+ return iph;
+
+ else if (iph->version == 6)
+ return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
+ buf);
+
+ return NULL;
+}
+
+static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
+ void *buf, unsigned int bufsize)
+{
+ unsigned int offset = skb_network_offset(skb);
+ const struct ipv6hdr *ipv6h;
+ const struct tcphdr *tcph;
+ const struct iphdr *iph;
+ struct ipv6hdr _ipv6h;
+ struct tcphdr _tcph;
+
+ ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h)
+ return NULL;
+
+ if (ipv6h->version == 4) {
+ iph = (struct iphdr *)ipv6h;
+ offset += iph->ihl * 4;
+
+ /* special-case 6in4 tunnelling, as that is a common way to get
+ * v6 connectivity in the home
+ */
+ if (iph->protocol == IPPROTO_IPV6) {
+ ipv6h = skb_header_pointer(skb, offset,
+ sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
+ return NULL;
+
+ offset += sizeof(struct ipv6hdr);
+
+ } else if (iph->protocol != IPPROTO_TCP) {
+ return NULL;
+ }
+
+ } else if (ipv6h->version == 6) {
+ if (ipv6h->nexthdr != IPPROTO_TCP)
+ return NULL;
+
+ offset += sizeof(struct ipv6hdr);
+ } else {
+ return NULL;
+ }
+
+ tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+ if (!tcph || tcph->doff < 5)
+ return NULL;
+
+ return skb_header_pointer(skb, offset,
+ min(__tcp_hdrlen(tcph), bufsize), buf);
+}
+
+static const void *cake_get_tcpopt(const struct tcphdr *tcph,
+ int code, int *oplen)
+{
+ /* inspired by tcp_parse_options in tcp_input.c */
+ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
+ const u8 *ptr = (const u8 *)(tcph + 1);
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ if (opcode == TCPOPT_EOL)
+ break;
+ if (opcode == TCPOPT_NOP) {
+ length--;
+ continue;
+ }
+ if (length < 2)
+ break;
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ break;
+
+ if (opcode == code) {
+ *oplen = opsize;
+ return ptr;
+ }
+
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+
+ return NULL;
+}
+
+/* Compare two SACK sequences. A sequence is considered greater if it SACKs more
+ * bytes than the other. In the case where both sequences ACKs bytes that the
+ * other doesn't, A is considered greater. DSACKs in A also makes A be
+ * considered greater.
+ *
+ * @return -1, 0 or 1 as normal compare functions
+ */
+static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
+ const struct tcphdr *tcph_b)
+{
+ const struct tcp_sack_block_wire *sack_a, *sack_b;
+ u32 ack_seq_a = ntohl(tcph_a->ack_seq);
+ u32 bytes_a = 0, bytes_b = 0;
+ int oplen_a, oplen_b;
+ bool first = true;
+
+ sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
+ sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
+
+ /* pointers point to option contents */
+ oplen_a -= TCPOLEN_SACK_BASE;
+ oplen_b -= TCPOLEN_SACK_BASE;
+
+ if (sack_a && oplen_a >= sizeof(*sack_a) &&
+ (!sack_b || oplen_b < sizeof(*sack_b)))
+ return -1;
+ else if (sack_b && oplen_b >= sizeof(*sack_b) &&
+ (!sack_a || oplen_a < sizeof(*sack_a)))
+ return 1;
+ else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
+ (!sack_b || oplen_b < sizeof(*sack_b)))
+ return 0;
+
+ while (oplen_a >= sizeof(*sack_a)) {
+ const struct tcp_sack_block_wire *sack_tmp = sack_b;
+ u32 start_a = get_unaligned_be32(&sack_a->start_seq);
+ u32 end_a = get_unaligned_be32(&sack_a->end_seq);
+ int oplen_tmp = oplen_b;
+ bool found = false;
+
+ /* DSACK; always considered greater to prevent dropping */
+ if (before(start_a, ack_seq_a))
+ return -1;
+
+ bytes_a += end_a - start_a;
+
+ while (oplen_tmp >= sizeof(*sack_tmp)) {
+ u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
+ u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
+
+ /* first time through we count the total size */
+ if (first)
+ bytes_b += end_b - start_b;
+
+ if (!after(start_b, start_a) && !before(end_b, end_a)) {
+ found = true;
+ if (!first)
+ break;
+ }
+ oplen_tmp -= sizeof(*sack_tmp);
+ sack_tmp++;
+ }
+
+ if (!found)
+ return -1;
+
+ oplen_a -= sizeof(*sack_a);
+ sack_a++;
+ first = false;
+ }
+
+ /* If we made it this far, all ranges SACKed by A are covered by B, so
+ * either the SACKs are equal, or B SACKs more bytes.
+ */
+ return bytes_b > bytes_a ? 1 : 0;
+}
+
+static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
+ u32 *tsval, u32 *tsecr)
+{
+ const u8 *ptr;
+ int opsize;
+
+ ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
+
+ if (ptr && opsize == TCPOLEN_TIMESTAMP) {
+ *tsval = get_unaligned_be32(ptr);
+ *tsecr = get_unaligned_be32(ptr + 4);
+ }
+}
+
+static bool cake_tcph_may_drop(const struct tcphdr *tcph,
+ u32 tstamp_new, u32 tsecr_new)
+{
+ /* inspired by tcp_parse_options in tcp_input.c */
+ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
+ const u8 *ptr = (const u8 *)(tcph + 1);
+ u32 tstamp, tsecr;
+
+ /* 3 reserved flags must be unset to avoid future breakage
+ * ACK must be set
+ * ECE/CWR are handled separately
+ * All other flags URG/PSH/RST/SYN/FIN must be unset
+ * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
+ * 0x00C00000 = CWR/ECE (handled separately)
+ * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
+ */
+ if (((tcp_flag_word(tcph) &
+ cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
+ return false;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ if (opcode == TCPOPT_EOL)
+ break;
+ if (opcode == TCPOPT_NOP) {
+ length--;
+ continue;
+ }
+ if (length < 2)
+ break;
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ break;
+
+ switch (opcode) {
+ case TCPOPT_MD5SIG: /* doesn't influence state */
+ break;
+
+ case TCPOPT_SACK: /* stricter checking performed later */
+ if (opsize % 8 != 2)
+ return false;
+ break;
+
+ case TCPOPT_TIMESTAMP:
+ /* only drop timestamps lower than new */
+ if (opsize != TCPOLEN_TIMESTAMP)
+ return false;
+ tstamp = get_unaligned_be32(ptr);
+ tsecr = get_unaligned_be32(ptr + 4);
+ if (after(tstamp, tstamp_new) ||
+ after(tsecr, tsecr_new))
+ return false;
+ break;
+
+ case TCPOPT_MSS: /* these should only be set on SYN */
+ case TCPOPT_WINDOW:
+ case TCPOPT_SACK_PERM:
+ case TCPOPT_FASTOPEN:
+ case TCPOPT_EXP:
+ default: /* don't drop if any unknown options are present */
+ return false;
+ }
+
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+
+ return true;
+}
+
+static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
+ struct cake_flow *flow)
+{
+ bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
+ struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
+ struct sk_buff *skb_check, *skb_prev = NULL;
+ const struct ipv6hdr *ipv6h, *ipv6h_check;
+ unsigned char _tcph[64], _tcph_check[64];
+ const struct tcphdr *tcph, *tcph_check;
+ const struct iphdr *iph, *iph_check;
+ struct ipv6hdr _iph, _iph_check;
+ const struct sk_buff *skb;
+ int seglen, num_found = 0;
+ u32 tstamp = 0, tsecr = 0;
+ __be32 elig_flags = 0;
+ int sack_comp;
+
+ /* no other possible ACKs to filter */
+ if (flow->head == flow->tail)
+ return NULL;
+
+ skb = flow->tail;
+ tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
+ iph = cake_get_iphdr(skb, &_iph);
+ if (!tcph)
+ return NULL;
+
+ cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
+
+ /* the 'triggering' packet need only have the ACK flag set.
+ * also check that SYN is not set, as there won't be any previous ACKs.
+ */
+ if ((tcp_flag_word(tcph) &
+ (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
+ return NULL;
+
+ /* the 'triggering' ACK is at the tail of the queue, we have already
+ * returned if it is the only packet in the flow. loop through the rest
+ * of the queue looking for pure ACKs with the same 5-tuple as the
+ * triggering one.
+ */
+ for (skb_check = flow->head;
+ skb_check && skb_check != skb;
+ skb_prev = skb_check, skb_check = skb_check->next) {
+ iph_check = cake_get_iphdr(skb_check, &_iph_check);
+ tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
+ sizeof(_tcph_check));
+
+ /* only TCP packets with matching 5-tuple are eligible, and only
+ * drop safe headers
+ */
+ if (!tcph_check || iph->version != iph_check->version ||
+ tcph_check->source != tcph->source ||
+ tcph_check->dest != tcph->dest)
+ continue;
+
+ if (iph_check->version == 4) {
+ if (iph_check->saddr != iph->saddr ||
+ iph_check->daddr != iph->daddr)
+ continue;
+
+ seglen = ntohs(iph_check->tot_len) -
+ (4 * iph_check->ihl);
+ } else if (iph_check->version == 6) {
+ ipv6h = (struct ipv6hdr *)iph;
+ ipv6h_check = (struct ipv6hdr *)iph_check;
+
+ if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
+ ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
+ continue;
+
+ seglen = ntohs(ipv6h_check->payload_len);
+ } else {
+ WARN_ON(1); /* shouldn't happen */
+ continue;
+ }
+
+ /* If the ECE/CWR flags changed from the previous eligible
+ * packet in the same flow, we should no longer be dropping that
+ * previous packet as this would lose information.
+ */
+ if (elig_ack && (tcp_flag_word(tcph_check) &
+ (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
+ elig_ack = NULL;
+ elig_ack_prev = NULL;
+ num_found--;
+ }
+
+ /* Check TCP options and flags, don't drop ACKs with segment
+ * data, and don't drop ACKs with a higher cumulative ACK
+ * counter than the triggering packet. Check ACK seqno here to
+ * avoid parsing SACK options of packets we are going to exclude
+ * anyway.
+ */
+ if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
+ (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
+ after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
+ continue;
+
+ /* Check SACK options. The triggering packet must SACK more data
+ * than the ACK under consideration, or SACK the same range but
+ * have a larger cumulative ACK counter. The latter is a
+ * pathological case, but is contained in the following check
+ * anyway, just to be safe.
+ */
+ sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
+
+ if (sack_comp < 0 ||
+ (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
+ sack_comp == 0))
+ continue;
+
+ /* At this point we have found an eligible pure ACK to drop; if
+ * we are in aggressive mode, we are done. Otherwise, keep
+ * searching unless this is the second eligible ACK we
+ * found.
+ *
+ * Since we want to drop ACK closest to the head of the queue,
+ * save the first eligible ACK we find, even if we need to loop
+ * again.
+ */
+ if (!elig_ack) {
+ elig_ack = skb_check;
+ elig_ack_prev = skb_prev;
+ elig_flags = (tcp_flag_word(tcph_check)
+ & (TCP_FLAG_ECE | TCP_FLAG_CWR));
+ }
+
+ if (num_found++ > 0)
+ goto found;
+ }
+
+ /* We made it through the queue without finding two eligible ACKs . If
+ * we found a single eligible ACK we can drop it in aggressive mode if
+ * we can guarantee that this does not interfere with ECN flag
+ * information. We ensure this by dropping it only if the enqueued
+ * packet is consecutive with the eligible ACK, and their flags match.
+ */
+ if (elig_ack && aggressive && elig_ack->next == skb &&
+ (elig_flags == (tcp_flag_word(tcph) &
+ (TCP_FLAG_ECE | TCP_FLAG_CWR))))
+ goto found;
+
+ return NULL;
+
+found:
+ if (elig_ack_prev)
+ elig_ack_prev->next = elig_ack->next;
+ else
+ flow->head = elig_ack->next;
+
+ elig_ack->next = NULL;
+
+ return elig_ack;
+}
+
+static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
+{
+ avg -= avg >> shift;
+ avg += sample >> shift;
+ return avg;
+}
+
+static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
+{
+ if (q->rate_flags & CAKE_FLAG_OVERHEAD)
+ len -= off;
+
+ if (q->max_netlen < len)
+ q->max_netlen = len;
+ if (q->min_netlen > len)
+ q->min_netlen = len;
+
+ len += q->rate_overhead;
+
+ if (len < q->rate_mpu)
+ len = q->rate_mpu;
+
+ if (q->atm_mode == CAKE_ATM_ATM) {
+ len += 47;
+ len /= 48;
+ len *= 53;
+ } else if (q->atm_mode == CAKE_ATM_PTM) {
+ /* Add one byte per 64 bytes or part thereof.
+ * This is conservative and easier to calculate than the
+ * precise value.
+ */
+ len += (len + 63) / 64;
+ }
+
+ if (q->max_adjlen < len)
+ q->max_adjlen = len;
+ if (q->min_adjlen > len)
+ q->min_adjlen = len;
+
+ return len;
+}
+
+static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int hdr_len, last_len = 0;
+ u32 off = skb_network_offset(skb);
+ u32 len = qdisc_pkt_len(skb);
+ u16 segs = 1;
+
+ q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
+
+ if (!shinfo->gso_size)
+ return cake_calc_overhead(q, len, off);
+
+ /* borrowed from qdisc_pkt_len_init() */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
+
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
+ segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+ else
+ segs = shinfo->gso_segs;
+
+ len = shinfo->gso_size + hdr_len;
+ last_len = skb->len - shinfo->gso_size * (segs - 1);
+
+ return (cake_calc_overhead(q, len, off) * (segs - 1) +
+ cake_calc_overhead(q, last_len, off));
+}
+
+static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
+{
+ struct cake_heap_entry ii = q->overflow_heap[i];
+ struct cake_heap_entry jj = q->overflow_heap[j];
+
+ q->overflow_heap[i] = jj;
+ q->overflow_heap[j] = ii;
+
+ q->tins[ii.t].overflow_idx[ii.b] = j;
+ q->tins[jj.t].overflow_idx[jj.b] = i;
+}
+
+static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
+{
+ struct cake_heap_entry ii = q->overflow_heap[i];
+
+ return q->tins[ii.t].backlogs[ii.b];
+}
+
+static void cake_heapify(struct cake_sched_data *q, u16 i)
+{
+ static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
+ u32 mb = cake_heap_get_backlog(q, i);
+ u32 m = i;
+
+ while (m < a) {
+ u32 l = m + m + 1;
+ u32 r = l + 1;
+
+ if (l < a) {
+ u32 lb = cake_heap_get_backlog(q, l);
+
+ if (lb > mb) {
+ m = l;
+ mb = lb;
+ }
+ }
+
+ if (r < a) {
+ u32 rb = cake_heap_get_backlog(q, r);
+
+ if (rb > mb) {
+ m = r;
+ mb = rb;
+ }
+ }
+
+ if (m != i) {
+ cake_heap_swap(q, i, m);
+ i = m;
+ } else {
+ break;
+ }
+ }
+}
+
+static void cake_heapify_up(struct cake_sched_data *q, u16 i)
+{
+ while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
+ u16 p = (i - 1) >> 1;
+ u32 ib = cake_heap_get_backlog(q, i);
+ u32 pb = cake_heap_get_backlog(q, p);
+
+ if (ib > pb) {
+ cake_heap_swap(q, i, p);
+ i = p;
+ } else {
+ break;
+ }
+ }
+}
+
+static int cake_advance_shaper(struct cake_sched_data *q,
+ struct cake_tin_data *b,
+ struct sk_buff *skb,
+ ktime_t now, bool drop)
+{
+ u32 len = get_cobalt_cb(skb)->adjusted_len;
+
+ /* charge packet bandwidth to this tin
+ * and to the global shaper.
+ */
+ if (q->rate_ns) {
+ u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
+ u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
+ u64 failsafe_dur = global_dur + (global_dur >> 1);
+
+ if (ktime_before(b->time_next_packet, now))
+ b->time_next_packet = ktime_add_ns(b->time_next_packet,
+ tin_dur);
+
+ else if (ktime_before(b->time_next_packet,
+ ktime_add_ns(now, tin_dur)))
+ b->time_next_packet = ktime_add_ns(now, tin_dur);
+
+ q->time_next_packet = ktime_add_ns(q->time_next_packet,
+ global_dur);
+ if (!drop)
+ q->failsafe_next_packet = \
+ ktime_add_ns(q->failsafe_next_packet,
+ failsafe_dur);
+ }
+ return len;
+}
+
+static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ ktime_t now = ktime_get();
+ u32 idx = 0, tin = 0, len;
+ struct cake_heap_entry qq;
+ struct cake_tin_data *b;
+ struct cake_flow *flow;
+ struct sk_buff *skb;
+
+ if (!q->overflow_timeout) {
+ int i;
+ /* Build fresh max-heap */
+ for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
+ cake_heapify(q, i);
+ }
+ q->overflow_timeout = 65535;
+
+ /* select longest queue for pruning */
+ qq = q->overflow_heap[0];
+ tin = qq.t;
+ idx = qq.b;
+
+ b = &q->tins[tin];
+ flow = &b->flows[idx];
+ skb = dequeue_head(flow);
+ if (unlikely(!skb)) {
+ /* heap has gone wrong, rebuild it next time */
+ q->overflow_timeout = 0;
+ return idx + (tin << 16);
+ }
+
+ if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count++;
+
+ len = qdisc_pkt_len(skb);
+ q->buffer_used -= skb->truesize;
+ b->backlogs[idx] -= len;
+ b->tin_backlog -= len;
+ sch->qstats.backlog -= len;
+ qdisc_tree_reduce_backlog(sch, 1, len);
+
+ flow->dropped++;
+ b->tin_dropped++;
+ sch->qstats.drops++;
+
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ cake_advance_shaper(q, b, skb, now, true);
+
+ __qdisc_drop(skb, to_free);
+ sch->q.qlen--;
+
+ cake_heapify(q, 0);
+
+ return idx + (tin << 16);
+}
+
+static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
+{
+ const int offset = skb_network_offset(skb);
+ u16 *buf, buf_;
+ u8 dscp;
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
+ if (unlikely(!buf))
+ return 0;
+
+ /* ToS is in the second byte of iphdr */
+ dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
+
+ if (wash && dscp) {
+ const int wlen = offset + sizeof(struct iphdr);
+
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ return 0;
+
+ ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+ }
+
+ return dscp;
+
+ case htons(ETH_P_IPV6):
+ buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
+ if (unlikely(!buf))
+ return 0;
+
+ /* Traffic class is in the first and second bytes of ipv6hdr */
+ dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
+
+ if (wash && dscp) {
+ const int wlen = offset + sizeof(struct ipv6hdr);
+
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ return 0;
+
+ ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+ }
+
+ return dscp;
+
+ case htons(ETH_P_ARP):
+ return 0x38; /* CS7 - Net Control */
+
+ default:
+ /* If there is no Diffserv field, treat as best-effort */
+ return 0;
+ }
+}
+
+static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
+ struct sk_buff *skb)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 tin;
+ bool wash;
+ u8 dscp;
+
+ /* Tin selection: Default to diffserv-based selection, allow overriding
+ * using firewall marks or skb->priority. Call DSCP parsing early if
+ * wash is enabled, otherwise defer to below to skip unneeded parsing.
+ */
+ wash = !!(q->rate_flags & CAKE_FLAG_WASH);
+ if (wash)
+ dscp = cake_handle_diffserv(skb, wash);
+
+ if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
+ tin = 0;
+
+ else if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->tin_cnt)
+ tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
+
+ else {
+ if (!wash)
+ dscp = cake_handle_diffserv(skb, wash);
+ tin = q->tin_index[dscp];
+
+ if (unlikely(tin >= q->tin_cnt))
+ tin = 0;
+ }
+
+ return &q->tins[tin];
+}
+
+static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
+ struct sk_buff *skb, int flow_mode, int *qerr)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ u16 flow = 0, host = 0;
+ int result;
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ goto hash;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, filter, &res, false);
+
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
+ flow = TC_H_MIN(res.classid);
+ if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
+ host = TC_H_MAJ(res.classid) >> 16;
+ }
+hash:
+ *t = cake_select_tin(sch, skb);
+ return cake_hash(*t, skb, flow_mode, flow, host) + 1;
+}
+
+static void cake_reconfigure(struct Qdisc *sch);
+
+static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ int len = qdisc_pkt_len(skb);
+ int uninitialized_var(ret);
+ struct sk_buff *ack = NULL;
+ ktime_t now = ktime_get();
+ struct cake_tin_data *b;
+ struct cake_flow *flow;
+ u32 idx;
+
+ /* choose flow to insert into */
+ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
+ if (idx == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+ idx--;
+ flow = &b->flows[idx];
+
+ /* ensure shaper state isn't stale */
+ if (!b->tin_backlog) {
+ if (ktime_before(b->time_next_packet, now))
+ b->time_next_packet = now;
+
+ if (!sch->q.qlen) {
+ if (ktime_before(q->time_next_packet, now)) {
+ q->failsafe_next_packet = now;
+ q->time_next_packet = now;
+ } else if (ktime_after(q->time_next_packet, now) &&
+ ktime_after(q->failsafe_next_packet, now)) {
+ u64 next = \
+ min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(
+ q->failsafe_next_packet));
+ sch->qstats.overlimits++;
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ }
+ }
+ }
+
+ if (unlikely(len > b->max_skblen))
+ b->max_skblen = len;
+
+ if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features = netif_skb_features(skb);
+ unsigned int slen = 0, numsegs = 0;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ while (segs) {
+ nskb = segs->next;
+ segs->next = NULL;
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ cobalt_set_enqueue_time(segs, now);
+ get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
+ segs);
+ flow_queue_add(flow, segs);
+
+ sch->q.qlen++;
+ numsegs++;
+ slen += segs->len;
+ q->buffer_used += segs->truesize;
+ b->packets++;
+ segs = nskb;
+ }
+
+ /* stats */
+ b->bytes += slen;
+ b->backlogs[idx] += slen;
+ b->tin_backlog += slen;
+ sch->qstats.backlog += slen;
+ q->avg_window_bytes += slen;
+
+ qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
+ consume_skb(skb);
+ } else {
+ /* not splitting */
+ cobalt_set_enqueue_time(skb, now);
+ get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
+ flow_queue_add(flow, skb);
+
+ if (q->ack_filter)
+ ack = cake_ack_filter(q, flow);
+
+ if (ack) {
+ b->ack_drops++;
+ sch->qstats.drops++;
+ b->bytes += qdisc_pkt_len(ack);
+ len -= qdisc_pkt_len(ack);
+ q->buffer_used += skb->truesize - ack->truesize;
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ cake_advance_shaper(q, b, ack, now, true);
+
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
+ consume_skb(ack);
+ } else {
+ sch->q.qlen++;
+ q->buffer_used += skb->truesize;
+ }
+
+ /* stats */
+ b->packets++;
+ b->bytes += len;
+ b->backlogs[idx] += len;
+ b->tin_backlog += len;
+ sch->qstats.backlog += len;
+ q->avg_window_bytes += len;
+ }
+
+ if (q->overflow_timeout)
+ cake_heapify_up(q, b->overflow_idx[idx]);
+
+ /* incoming bandwidth capacity estimate */
+ if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
+ u64 packet_interval = \
+ ktime_to_ns(ktime_sub(now, q->last_packet_time));
+
+ if (packet_interval > NSEC_PER_SEC)
+ packet_interval = NSEC_PER_SEC;
+
+ /* filter out short-term bursts, eg. wifi aggregation */
+ q->avg_packet_interval = \
+ cake_ewma(q->avg_packet_interval,
+ packet_interval,
+ (packet_interval > q->avg_packet_interval ?
+ 2 : 8));
+
+ q->last_packet_time = now;
+
+ if (packet_interval > q->avg_packet_interval) {
+ u64 window_interval = \
+ ktime_to_ns(ktime_sub(now,
+ q->avg_window_begin));
+ u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
+
+ b = div64_u64(b, window_interval);
+ q->avg_peak_bandwidth =
+ cake_ewma(q->avg_peak_bandwidth, b,
+ b > q->avg_peak_bandwidth ? 2 : 8);
+ q->avg_window_bytes = 0;
+ q->avg_window_begin = now;
+
+ if (ktime_after(now,
+ ktime_add_ms(q->last_reconfig_time,
+ 250))) {
+ q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
+ cake_reconfigure(sch);
+ }
+ }
+ } else {
+ q->avg_window_bytes = 0;
+ q->last_packet_time = now;
+ }
+
+ /* flowchain */
+ if (!flow->set || flow->set == CAKE_SET_DECAYING) {
+ struct cake_host *srchost = &b->hosts[flow->srchost];
+ struct cake_host *dsthost = &b->hosts[flow->dsthost];
+ u16 host_load = 1;
+
+ if (!flow->set) {
+ list_add_tail(&flow->flowchain, &b->new_flows);
+ } else {
+ b->decaying_flow_count--;
+ list_move_tail(&flow->flowchain, &b->new_flows);
+ }
+ flow->set = CAKE_SET_SPARSE;
+ b->sparse_flow_count++;
+
+ if (cake_dsrc(q->flow_mode))
+ host_load = max(host_load, srchost->srchost_refcnt);
+
+ if (cake_ddst(q->flow_mode))
+ host_load = max(host_load, dsthost->dsthost_refcnt);
+
+ flow->deficit = (b->flow_quantum *
+ quantum_div[host_load]) >> 16;
+ } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
+ /* this flow was empty, accounted as a sparse flow, but actually
+ * in the bulk rotation.
+ */
+ flow->set = CAKE_SET_BULK;
+ b->sparse_flow_count--;
+ b->bulk_flow_count++;
+ }
+
+ if (q->buffer_used > q->buffer_max_used)
+ q->buffer_max_used = q->buffer_used;
+
+ if (q->buffer_used > q->buffer_limit) {
+ u32 dropped = 0;
+
+ while (q->buffer_used > q->buffer_limit) {
+ dropped++;
+ cake_drop(sch, to_free);
+ }
+ b->drop_overlimit += dropped;
+ }
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_tin_data *b = &q->tins[q->cur_tin];
+ struct cake_flow *flow = &b->flows[q->cur_flow];
+ struct sk_buff *skb = NULL;
+ u32 len;
+
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ len = qdisc_pkt_len(skb);
+ b->backlogs[q->cur_flow] -= len;
+ b->tin_backlog -= len;
+ sch->qstats.backlog -= len;
+ q->buffer_used -= skb->truesize;
+ sch->q.qlen--;
+
+ if (q->overflow_timeout)
+ cake_heapify(q, b->overflow_idx[q->cur_flow]);
+ }
+ return skb;
+}
+
+/* Discard leftover packets from a tin no longer in use. */
+static void cake_clear_tin(struct Qdisc *sch, u16 tin)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ q->cur_tin = tin;
+ for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
+ while (!!(skb = cake_dequeue_one(sch)))
+ kfree_skb(skb);
+}
+
+static struct sk_buff *cake_dequeue(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_tin_data *b = &q->tins[q->cur_tin];
+ struct cake_host *srchost, *dsthost;
+ ktime_t now = ktime_get();
+ struct cake_flow *flow;
+ struct list_head *head;
+ bool first_flow = true;
+ struct sk_buff *skb;
+ u16 host_load;
+ u64 delay;
+ u32 len;
+
+begin:
+ if (!sch->q.qlen)
+ return NULL;
+
+ /* global hard shaper */
+ if (ktime_after(q->time_next_packet, now) &&
+ ktime_after(q->failsafe_next_packet, now)) {
+ u64 next = min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(q->failsafe_next_packet));
+
+ sch->qstats.overlimits++;
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ return NULL;
+ }
+
+ /* Choose a class to work on. */
+ if (!q->rate_ns) {
+ /* In unlimited mode, can't rely on shaper timings, just balance
+ * with DRR
+ */
+ bool wrapped = false, empty = true;
+
+ while (b->tin_deficit < 0 ||
+ !(b->sparse_flow_count + b->bulk_flow_count)) {
+ if (b->tin_deficit <= 0)
+ b->tin_deficit += b->tin_quantum_band;
+ if (b->sparse_flow_count + b->bulk_flow_count)
+ empty = false;
+
+ q->cur_tin++;
+ b++;
+ if (q->cur_tin >= q->tin_cnt) {
+ q->cur_tin = 0;
+ b = q->tins;
+
+ if (wrapped) {
+ /* It's possible for q->qlen to be
+ * nonzero when we actually have no
+ * packets anywhere.
+ */
+ if (empty)
+ return NULL;
+ } else {
+ wrapped = true;
+ }
+ }
+ }
+ } else {
+ /* In shaped mode, choose:
+ * - Highest-priority tin with queue and meeting schedule, or
+ * - The earliest-scheduled tin with queue.
+ */
+ ktime_t best_time = KTIME_MAX;
+ int tin, best_tin = 0;
+
+ for (tin = 0; tin < q->tin_cnt; tin++) {
+ b = q->tins + tin;
+ if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
+ ktime_t time_to_pkt = \
+ ktime_sub(b->time_next_packet, now);
+
+ if (ktime_to_ns(time_to_pkt) <= 0 ||
+ ktime_compare(time_to_pkt,
+ best_time) <= 0) {
+ best_time = time_to_pkt;
+ best_tin = tin;
+ }
+ }
+ }
+
+ q->cur_tin = best_tin;
+ b = q->tins + best_tin;
+
+ /* No point in going further if no packets to deliver. */
+ if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
+ return NULL;
+ }
+
+retry:
+ /* service this class */
+ head = &b->decaying_flows;
+ if (!first_flow || list_empty(head)) {
+ head = &b->new_flows;
+ if (list_empty(head)) {
+ head = &b->old_flows;
+ if (unlikely(list_empty(head))) {
+ head = &b->decaying_flows;
+ if (unlikely(list_empty(head)))
+ goto begin;
+ }
+ }
+ }
+ flow = list_first_entry(head, struct cake_flow, flowchain);
+ q->cur_flow = flow - b->flows;
+ first_flow = false;
+
+ /* triple isolation (modified DRR++) */
+ srchost = &b->hosts[flow->srchost];
+ dsthost = &b->hosts[flow->dsthost];
+ host_load = 1;
+
+ if (cake_dsrc(q->flow_mode))
+ host_load = max(host_load, srchost->srchost_refcnt);
+
+ if (cake_ddst(q->flow_mode))
+ host_load = max(host_load, dsthost->dsthost_refcnt);
+
+ WARN_ON(host_load > CAKE_QUEUES);
+
+ /* flow isolation (DRR++) */
+ if (flow->deficit <= 0) {
+ /* The shifted prandom_u32() is a way to apply dithering to
+ * avoid accumulating roundoff errors
+ */
+ flow->deficit += (b->flow_quantum * quantum_div[host_load] +
+ (prandom_u32() >> 16)) >> 16;
+ list_move_tail(&flow->flowchain, &b->old_flows);
+
+ /* Keep all flows with deficits out of the sparse and decaying
+ * rotations. No non-empty flow can go into the decaying
+ * rotation, so they can't get deficits
+ */
+ if (flow->set == CAKE_SET_SPARSE) {
+ if (flow->head) {
+ b->sparse_flow_count--;
+ b->bulk_flow_count++;
+ flow->set = CAKE_SET_BULK;
+ } else {
+ /* we've moved it to the bulk rotation for
+ * correct deficit accounting but we still want
+ * to count it as a sparse flow, not a bulk one.
+ */
+ flow->set = CAKE_SET_SPARSE_WAIT;
+ }
+ }
+ goto retry;
+ }
+
+ /* Retrieve a packet via the AQM */
+ while (1) {
+ skb = cake_dequeue_one(sch);
+ if (!skb) {
+ /* this queue was actually empty */
+ if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count--;
+
+ if (flow->cvars.p_drop || flow->cvars.count ||
+ ktime_before(now, flow->cvars.drop_next)) {
+ /* keep in the flowchain until the state has
+ * decayed to rest
+ */
+ list_move_tail(&flow->flowchain,
+ &b->decaying_flows);
+ if (flow->set == CAKE_SET_BULK) {
+ b->bulk_flow_count--;
+ b->decaying_flow_count++;
+ } else if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT) {
+ b->sparse_flow_count--;
+ b->decaying_flow_count++;
+ }
+ flow->set = CAKE_SET_DECAYING;
+ } else {
+ /* remove empty queue from the flowchain */
+ list_del_init(&flow->flowchain);
+ if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT)
+ b->sparse_flow_count--;
+ else if (flow->set == CAKE_SET_BULK)
+ b->bulk_flow_count--;
+ else
+ b->decaying_flow_count--;
+
+ flow->set = CAKE_SET_NONE;
+ srchost->srchost_refcnt--;
+ dsthost->dsthost_refcnt--;
+ }
+ goto begin;
+ }
+
+ /* Last packet in queue may be marked, shouldn't be dropped */
+ if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
+ (b->bulk_flow_count *
+ !!(q->rate_flags &
+ CAKE_FLAG_INGRESS))) ||
+ !flow->head)
+ break;
+
+ /* drop this packet, get another one */
+ if (q->rate_flags & CAKE_FLAG_INGRESS) {
+ len = cake_advance_shaper(q, b, skb,
+ now, true);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+ }
+ flow->dropped++;
+ b->tin_dropped++;
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ goto retry;
+ }
+
+ b->tin_ecn_mark += !!flow->cvars.ecn_marked;
+ qdisc_bstats_update(sch, skb);
+
+ /* collect delay stats */
+ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+ b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
+ b->peak_delay = cake_ewma(b->peak_delay, delay,
+ delay > b->peak_delay ? 2 : 8);
+ b->base_delay = cake_ewma(b->base_delay, delay,
+ delay < b->base_delay ? 2 : 8);
+
+ len = cake_advance_shaper(q, b, skb, now, false);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+
+ if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
+ u64 next = min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(q->failsafe_next_packet));
+
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ } else if (!sch->q.qlen) {
+ int i;
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ if (q->tins[i].decaying_flow_count) {
+ ktime_t next = \
+ ktime_add_ns(now,
+ q->tins[i].cparams.target);
+
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ ktime_to_ns(next));
+ break;
+ }
+ }
+ }
+
+ if (q->overflow_timeout)
+ q->overflow_timeout--;
+
+ return skb;
+}
+
+static void cake_reset(struct Qdisc *sch)
+{
+ u32 c;
+
+ for (c = 0; c < CAKE_MAX_TINS; c++)
+ cake_clear_tin(sch, c);
+}
+
+static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
+ [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 },
+ [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_ATM] = { .type = NLA_U32 },
+ [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
+ [TCA_CAKE_RTT] = { .type = NLA_U32 },
+ [TCA_CAKE_TARGET] = { .type = NLA_U32 },
+ [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
+ [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
+ [TCA_CAKE_NAT] = { .type = NLA_U32 },
+ [TCA_CAKE_RAW] = { .type = NLA_U32 },
+ [TCA_CAKE_WASH] = { .type = NLA_U32 },
+ [TCA_CAKE_MPU] = { .type = NLA_U32 },
+ [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
+ [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+};
+
+static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
+ u64 target_ns, u64 rtt_est_ns)
+{
+ /* convert byte-rate into time-per-byte
+ * so it will always unwedge in reasonable time.
+ */
+ static const u64 MIN_RATE = 64;
+ u32 byte_target = mtu;
+ u64 byte_target_ns;
+ u8 rate_shft = 0;
+ u64 rate_ns = 0;
+
+ b->flow_quantum = 1514;
+ if (rate) {
+ b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
+ rate_shft = 34;
+ rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
+ rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
+ while (!!(rate_ns >> 34)) {
+ rate_ns >>= 1;
+ rate_shft--;
+ }
+ } /* else unlimited, ie. zero delay */
+
+ b->tin_rate_bps = rate;
+ b->tin_rate_ns = rate_ns;
+ b->tin_rate_shft = rate_shft;
+
+ byte_target_ns = (byte_target * rate_ns) >> rate_shft;
+
+ b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
+ b->cparams.interval = max(rtt_est_ns +
+ b->cparams.target - target_ns,
+ b->cparams.target * 2);
+ b->cparams.mtu_time = byte_target_ns;
+ b->cparams.p_inc = 1 << 24; /* 1/256 */
+ b->cparams.p_dec = 1 << 20; /* 1/4096 */
+}
+
+static int cake_config_besteffort(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_tin_data *b = &q->tins[0];
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+
+ q->tin_cnt = 1;
+
+ q->tin_index = besteffort;
+ q->tin_order = normal_order;
+
+ cake_set_rate(b, rate, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ b->tin_quantum_band = 65535;
+ b->tin_quantum_prio = 65535;
+
+ return 0;
+}
+
+static int cake_config_precedence(struct Qdisc *sch)
+{
+ /* convert high-level (user visible) parameters into internal format */
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum1 = 256;
+ u32 quantum2 = 256;
+ u32 i;
+
+ q->tin_cnt = 8;
+ q->tin_index = precedence;
+ q->tin_order = normal_order;
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[i];
+
+ cake_set_rate(b, rate, mtu, us_to_ns(q->target),
+ us_to_ns(q->interval));
+
+ b->tin_quantum_prio = max_t(u16, 1U, quantum1);
+ b->tin_quantum_band = max_t(u16, 1U, quantum2);
+
+ /* calculate next class's parameters */
+ rate *= 7;
+ rate >>= 3;
+
+ quantum1 *= 3;
+ quantum1 >>= 1;
+
+ quantum2 *= 7;
+ quantum2 >>= 3;
+ }
+
+ return 0;
+}
+
+/* List of known Diffserv codepoints:
+ *
+ * Least Effort (CS1)
+ * Best Effort (CS0)
+ * Max Reliability & LLT "Lo" (TOS1)
+ * Max Throughput (TOS2)
+ * Min Delay (TOS4)
+ * LLT "La" (TOS5)
+ * Assured Forwarding 1 (AF1x) - x3
+ * Assured Forwarding 2 (AF2x) - x3
+ * Assured Forwarding 3 (AF3x) - x3
+ * Assured Forwarding 4 (AF4x) - x3
+ * Precedence Class 2 (CS2)
+ * Precedence Class 3 (CS3)
+ * Precedence Class 4 (CS4)
+ * Precedence Class 5 (CS5)
+ * Precedence Class 6 (CS6)
+ * Precedence Class 7 (CS7)
+ * Voice Admit (VA)
+ * Expedited Forwarding (EF)
+
+ * Total 25 codepoints.
+ */
+
+/* List of traffic classes in RFC 4594:
+ * (roughly descending order of contended priority)
+ * (roughly ascending order of uncontended throughput)
+ *
+ * Network Control (CS6,CS7) - routing traffic
+ * Telephony (EF,VA) - aka. VoIP streams
+ * Signalling (CS5) - VoIP setup
+ * Multimedia Conferencing (AF4x) - aka. video calls
+ * Realtime Interactive (CS4) - eg. games
+ * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
+ * Broadcast Video (CS3)
+ * Low Latency Data (AF2x,TOS4) - eg. database
+ * Ops, Admin, Management (CS2,TOS1) - eg. ssh
+ * Standard Service (CS0 & unrecognised codepoints)
+ * High Throughput Data (AF1x,TOS2) - eg. web traffic
+ * Low Priority Data (CS1) - eg. BitTorrent
+
+ * Total 12 traffic classes.
+ */
+
+static int cake_config_diffserv8(struct Qdisc *sch)
+{
+/* Pruned list of traffic classes for typical applications:
+ *
+ * Network Control (CS6, CS7)
+ * Minimum Latency (EF, VA, CS5, CS4)
+ * Interactive Shell (CS2, TOS1)
+ * Low Latency Transactions (AF2x, TOS4)
+ * Video Streaming (AF4x, AF3x, CS3)
+ * Bog Standard (CS0 etc.)
+ * High Throughput (AF1x, TOS2)
+ * Background Traffic (CS1)
+ *
+ * Total 8 traffic classes.
+ */
+
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum1 = 256;
+ u32 quantum2 = 256;
+ u32 i;
+
+ q->tin_cnt = 8;
+
+ /* codepoint to class mapping */
+ q->tin_index = diffserv8;
+ q->tin_order = normal_order;
+
+ /* class characteristics */
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[i];
+
+ cake_set_rate(b, rate, mtu, us_to_ns(q->target),
+ us_to_ns(q->interval));
+
+ b->tin_quantum_prio = max_t(u16, 1U, quantum1);
+ b->tin_quantum_band = max_t(u16, 1U, quantum2);
+
+ /* calculate next class's parameters */
+ rate *= 7;
+ rate >>= 3;
+
+ quantum1 *= 3;
+ quantum1 >>= 1;
+
+ quantum2 *= 7;
+ quantum2 >>= 3;
+ }
+
+ return 0;
+}
+
+static int cake_config_diffserv4(struct Qdisc *sch)
+{
+/* Further pruned list of traffic classes for four-class system:
+ *
+ * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
+ * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
+ * Best Effort (CS0, AF1x, TOS2, and those not specified)
+ * Background Traffic (CS1)
+ *
+ * Total 4 traffic classes.
+ */
+
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum = 1024;
+
+ q->tin_cnt = 4;
+
+ /* codepoint to class mapping */
+ q->tin_index = diffserv4;
+ q->tin_order = bulk_order;
+
+ /* class characteristics */
+ cake_set_rate(&q->tins[0], rate, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[1], rate >> 4, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[2], rate >> 1, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[3], rate >> 2, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+
+ /* priority weights */
+ q->tins[0].tin_quantum_prio = quantum;
+ q->tins[1].tin_quantum_prio = quantum >> 4;
+ q->tins[2].tin_quantum_prio = quantum << 2;
+ q->tins[3].tin_quantum_prio = quantum << 4;
+
+ /* bandwidth-sharing weights */
+ q->tins[0].tin_quantum_band = quantum;
+ q->tins[1].tin_quantum_band = quantum >> 4;
+ q->tins[2].tin_quantum_band = quantum >> 1;
+ q->tins[3].tin_quantum_band = quantum >> 2;
+
+ return 0;
+}
+
+static int cake_config_diffserv3(struct Qdisc *sch)
+{
+/* Simplified Diffserv structure with 3 tins.
+ * Low Priority (CS1)
+ * Best Effort
+ * Latency Sensitive (TOS4, VA, EF, CS6, CS7)
+ */
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum = 1024;
+
+ q->tin_cnt = 3;
+
+ /* codepoint to class mapping */
+ q->tin_index = diffserv3;
+ q->tin_order = bulk_order;
+
+ /* class characteristics */
+ cake_set_rate(&q->tins[0], rate, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[1], rate >> 4, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[2], rate >> 2, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+
+ /* priority weights */
+ q->tins[0].tin_quantum_prio = quantum;
+ q->tins[1].tin_quantum_prio = quantum >> 4;
+ q->tins[2].tin_quantum_prio = quantum << 4;
+
+ /* bandwidth-sharing weights */
+ q->tins[0].tin_quantum_band = quantum;
+ q->tins[1].tin_quantum_band = quantum >> 4;
+ q->tins[2].tin_quantum_band = quantum >> 2;
+
+ return 0;
+}
+
+static void cake_reconfigure(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ int c, ft;
+
+ switch (q->tin_mode) {
+ case CAKE_DIFFSERV_BESTEFFORT:
+ ft = cake_config_besteffort(sch);
+ break;
+
+ case CAKE_DIFFSERV_PRECEDENCE:
+ ft = cake_config_precedence(sch);
+ break;
+
+ case CAKE_DIFFSERV_DIFFSERV8:
+ ft = cake_config_diffserv8(sch);
+ break;
+
+ case CAKE_DIFFSERV_DIFFSERV4:
+ ft = cake_config_diffserv4(sch);
+ break;
+
+ case CAKE_DIFFSERV_DIFFSERV3:
+ default:
+ ft = cake_config_diffserv3(sch);
+ break;
+ }
+
+ for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
+ cake_clear_tin(sch, c);
+ q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
+ }
+
+ q->rate_ns = q->tins[ft].tin_rate_ns;
+ q->rate_shft = q->tins[ft].tin_rate_shft;
+
+ if (q->buffer_config_limit) {
+ q->buffer_limit = q->buffer_config_limit;
+ } else if (q->rate_bps) {
+ u64 t = q->rate_bps * q->interval;
+
+ do_div(t, USEC_PER_SEC / 4);
+ q->buffer_limit = max_t(u32, t, 4U << 20);
+ } else {
+ q->buffer_limit = ~0;
+ }
+
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ q->buffer_limit = min(q->buffer_limit,
+ max(sch->limit * psched_mtu(qdisc_dev(sch)),
+ q->buffer_config_limit));
+}
+
+static int cake_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CAKE_MAX + 1];
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CAKE_NAT]) {
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
+ q->flow_mode |= CAKE_FLOW_NAT_FLAG *
+ !!nla_get_u32(tb[TCA_CAKE_NAT]);
+#else
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
+ "No conntrack support in kernel");
+ return -EOPNOTSUPP;
+#endif
+ }
+
+ if (tb[TCA_CAKE_BASE_RATE64])
+ q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
+
+ if (tb[TCA_CAKE_DIFFSERV_MODE])
+ q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
+
+ if (tb[TCA_CAKE_WASH]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
+ q->rate_flags |= CAKE_FLAG_WASH;
+ else
+ q->rate_flags &= ~CAKE_FLAG_WASH;
+ }
+
+ if (tb[TCA_CAKE_FLOW_MODE])
+ q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
+ (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
+ CAKE_FLOW_MASK));
+
+ if (tb[TCA_CAKE_ATM])
+ q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
+
+ if (tb[TCA_CAKE_OVERHEAD]) {
+ q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
+ q->rate_flags |= CAKE_FLAG_OVERHEAD;
+
+ q->max_netlen = 0;
+ q->max_adjlen = 0;
+ q->min_netlen = ~0;
+ q->min_adjlen = ~0;
+ }
+
+ if (tb[TCA_CAKE_RAW]) {
+ q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
+
+ q->max_netlen = 0;
+ q->max_adjlen = 0;
+ q->min_netlen = ~0;
+ q->min_adjlen = ~0;
+ }
+
+ if (tb[TCA_CAKE_MPU])
+ q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
+
+ if (tb[TCA_CAKE_RTT]) {
+ q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
+
+ if (!q->interval)
+ q->interval = 1;
+ }
+
+ if (tb[TCA_CAKE_TARGET]) {
+ q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
+
+ if (!q->target)
+ q->target = 1;
+ }
+
+ if (tb[TCA_CAKE_AUTORATE]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
+ q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
+ else
+ q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
+ }
+
+ if (tb[TCA_CAKE_INGRESS]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
+ q->rate_flags |= CAKE_FLAG_INGRESS;
+ else
+ q->rate_flags &= ~CAKE_FLAG_INGRESS;
+ }
+
+ if (tb[TCA_CAKE_ACK_FILTER])
+ q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
+
+ if (tb[TCA_CAKE_MEMORY])
+ q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
+
+ if (tb[TCA_CAKE_SPLIT_GSO]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
+ q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ else
+ q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+ }
+
+ if (q->tins) {
+ sch_tree_lock(sch);
+ cake_reconfigure(sch);
+ sch_tree_unlock(sch);
+ }
+
+ return 0;
+}
+
+static void cake_destroy(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ tcf_block_put(q->block);
+ kvfree(q->tins);
+}
+
+static int cake_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ int i, j, err;
+
+ sch->limit = 10240;
+ q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
+ q->flow_mode = CAKE_FLOW_TRIPLE;
+
+ q->rate_bps = 0; /* unlimited by default */
+
+ q->interval = 100000; /* 100ms default */
+ q->target = 5000; /* 5ms: codel RFC argues
+ * for 5 to 10% of interval
+ */
+ q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ q->cur_tin = 0;
+ q->cur_flow = 0;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (opt) {
+ err = cake_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ quantum_div[0] = ~0;
+ for (i = 1; i <= CAKE_QUEUES; i++)
+ quantum_div[i] = 65535 / i;
+
+ q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
+ GFP_KERNEL);
+ if (!q->tins)
+ return -ENOMEM;
+
+ for (i = 0; i < CAKE_MAX_TINS; i++) {
+ struct cake_tin_data *b = q->tins + i;
+
+ INIT_LIST_HEAD(&b->new_flows);
+ INIT_LIST_HEAD(&b->old_flows);
+ INIT_LIST_HEAD(&b->decaying_flows);
+ b->sparse_flow_count = 0;
+ b->bulk_flow_count = 0;
+ b->decaying_flow_count = 0;
+
+ for (j = 0; j < CAKE_QUEUES; j++) {
+ struct cake_flow *flow = b->flows + j;
+ u32 k = j * CAKE_MAX_TINS + i;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ cobalt_vars_init(&flow->cvars);
+
+ q->overflow_heap[k].t = i;
+ q->overflow_heap[k].b = j;
+ b->overflow_idx[j] = k;
+ }
+ }
+
+ cake_reconfigure(sch);
+ q->avg_peak_bandwidth = q->rate_bps;
+ q->min_netlen = ~0;
+ q->min_adjlen = ~0;
+ return 0;
+}
+
+static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
+ TCA_CAKE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
+ q->flow_mode & CAKE_FLOW_MASK))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
+ !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_INGRESS,
+ !!(q->rate_flags & CAKE_FLAG_INGRESS)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_NAT,
+ !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_WASH,
+ !!(q->rate_flags & CAKE_FLAG_WASH)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
+ goto nla_put_failure;
+
+ if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
+ if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
+ !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tstats, *ts;
+ int i;
+
+ if (!stats)
+ return -1;
+
+#define PUT_STAT_U32(attr, data) do { \
+ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+#define PUT_STAT_U64(attr, data) do { \
+ if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
+ data, TCA_CAKE_STATS_PAD)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
+ PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
+ PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
+ PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
+ PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
+ PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
+ PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
+ PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
+
+#undef PUT_STAT_U32
+#undef PUT_STAT_U64
+
+ tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
+ if (!tstats)
+ goto nla_put_failure;
+
+#define PUT_TSTAT_U32(attr, data) do { \
+ if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+#define PUT_TSTAT_U64(attr, data) do { \
+ if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
+ data, TCA_CAKE_TIN_STATS_PAD)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[q->tin_order[i]];
+
+ ts = nla_nest_start(d->skb, i + 1);
+ if (!ts)
+ goto nla_put_failure;
+
+ PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
+ PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
+ PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
+
+ PUT_TSTAT_U32(TARGET_US,
+ ktime_to_us(ns_to_ktime(b->cparams.target)));
+ PUT_TSTAT_U32(INTERVAL_US,
+ ktime_to_us(ns_to_ktime(b->cparams.interval)));
+
+ PUT_TSTAT_U32(SENT_PACKETS, b->packets);
+ PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
+ PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
+ PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
+
+ PUT_TSTAT_U32(PEAK_DELAY_US,
+ ktime_to_us(ns_to_ktime(b->peak_delay)));
+ PUT_TSTAT_U32(AVG_DELAY_US,
+ ktime_to_us(ns_to_ktime(b->avge_delay)));
+ PUT_TSTAT_U32(BASE_DELAY_US,
+ ktime_to_us(ns_to_ktime(b->base_delay)));
+
+ PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
+ PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
+ PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
+
+ PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
+ b->decaying_flow_count);
+ PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
+ PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
+ PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
+
+ PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
+ nla_nest_end(d->skb, ts);
+ }
+
+#undef PUT_TSTAT_U32
+#undef PUT_TSTAT_U64
+
+ nla_nest_end(d->skb, tstats);
+ return nla_nest_end(d->skb, stats);
+
+nla_put_failure:
+ nla_nest_cancel(d->skb, stats);
+ return -1;
+}
+
+static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long cake_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static void cake_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->block;
+}
+
+static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ const struct cake_flow *flow = NULL;
+ struct gnet_stats_queue qs = { 0 };
+ struct nlattr *stats;
+ u32 idx = cl - 1;
+
+ if (idx < CAKE_QUEUES * q->tin_cnt) {
+ const struct cake_tin_data *b = \
+ &q->tins[q->tin_order[idx / CAKE_QUEUES]];
+ const struct sk_buff *skb;
+
+ flow = &b->flows[idx % CAKE_QUEUES];
+
+ if (flow->head) {
+ sch_tree_lock(sch);
+ skb = flow->head;
+ while (skb) {
+ qs.qlen++;
+ skb = skb->next;
+ }
+ sch_tree_unlock(sch);
+ }
+ qs.backlog = b->backlogs[idx % CAKE_QUEUES];
+ qs.drops = flow->dropped;
+ }
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+ return -1;
+ if (flow) {
+ ktime_t now = ktime_get();
+
+ stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ if (!stats)
+ return -1;
+
+#define PUT_STAT_U32(attr, data) do { \
+ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+#define PUT_STAT_S32(attr, data) do { \
+ if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ PUT_STAT_S32(DEFICIT, flow->deficit);
+ PUT_STAT_U32(DROPPING, flow->cvars.dropping);
+ PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
+ PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
+ if (flow->cvars.p_drop) {
+ PUT_STAT_S32(BLUE_TIMER_US,
+ ktime_to_us(
+ ktime_sub(now,
+ flow->cvars.blue_timer)));
+ }
+ if (flow->cvars.dropping) {
+ PUT_STAT_S32(DROP_NEXT_US,
+ ktime_to_us(
+ ktime_sub(now,
+ flow->cvars.drop_next)));
+ }
+
+ if (nla_nest_end(d->skb, stats) < 0)
+ return -1;
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(d->skb, stats);
+ return -1;
+}
+
+static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ unsigned int i, j;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[q->tin_order[i]];
+
+ for (j = 0; j < CAKE_QUEUES; j++) {
+ if (list_empty(&b->flows[j].flowchain) ||
+ arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static const struct Qdisc_class_ops cake_class_ops = {
+ .leaf = cake_leaf,
+ .find = cake_find,
+ .tcf_block = cake_tcf_block,
+ .bind_tcf = cake_bind,
+ .unbind_tcf = cake_unbind,
+ .dump = cake_dump_class,
+ .dump_stats = cake_dump_class_stats,
+ .walk = cake_walk,
+};
+
+static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
+ .cl_ops = &cake_class_ops,
+ .id = "cake",
+ .priv_size = sizeof(struct cake_sched_data),
+ .enqueue = cake_enqueue,
+ .dequeue = cake_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = cake_init,
+ .reset = cake_reset,
+ .destroy = cake_destroy,
+ .change = cake_change,
+ .dump = cake_dump,
+ .dump_stats = cake_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init cake_module_init(void)
+{
+ return register_qdisc(&cake_qdisc_ops);
+}
+
+static void __exit cake_module_exit(void)
+{
+ unregister_qdisc(&cake_qdisc_ops);
+}
+
+module_init(cake_module_init)
+module_exit(cake_module_exit)
+MODULE_AUTHOR("Jonathan Morton");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("The CAKE shaper.");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 000000000..0a76ad05e
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,1823 @@
+/*
+ * net/sched/sch_cbq.c Class-Based Queueing discipline.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+
+/* Class-Based Queueing (CBQ) algorithm.
+ =======================================
+
+ Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
+ Management Models for Packet Networks",
+ IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
+
+ [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
+
+ [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
+ Parameters", 1996
+
+ [4] Sally Floyd and Michael Speer, "Experimental Results
+ for Class-Based Queueing", 1998, not published.
+
+ -----------------------------------------------------------------------
+
+ Algorithm skeleton was taken from NS simulator cbq.cc.
+ If someone wants to check this code against the LBL version,
+ he should take into account that ONLY the skeleton was borrowed,
+ the implementation is different. Particularly:
+
+ --- The WRR algorithm is different. Our version looks more
+ reasonable (I hope) and works when quanta are allowed to be
+ less than MTU, which is always the case when real time classes
+ have small rates. Note, that the statement of [3] is
+ incomplete, delay may actually be estimated even if class
+ per-round allotment is less than MTU. Namely, if per-round
+ allotment is W*r_i, and r_1+...+r_k = r < 1
+
+ delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
+
+ In the worst case we have IntServ estimate with D = W*r+k*MTU
+ and C = MTU*r. The proof (if correct at all) is trivial.
+
+
+ --- It seems that cbq-2.0 is not very accurate. At least, I cannot
+ interpret some places, which look like wrong translations
+ from NS. Anyone is advised to find these differences
+ and explain to me, why I am wrong 8).
+
+ --- Linux has no EOI event, so that we cannot estimate true class
+ idle time. Workaround is to consider the next dequeue event
+ as sign that previous packet is finished. This is wrong because of
+ internal device queueing, but on a permanently loaded link it is true.
+ Moreover, combined with clock integrator, this scheme looks
+ very close to an ideal solution. */
+
+struct cbq_sched_data;
+
+
+struct cbq_class {
+ struct Qdisc_class_common common;
+ struct cbq_class *next_alive; /* next class with backlog in this priority band */
+
+/* Parameters */
+ unsigned char priority; /* class priority */
+ unsigned char priority2; /* priority to be used after overlimit */
+ unsigned char ewma_log; /* time constant for idle time calculation */
+
+ u32 defmap;
+
+ /* Link-sharing scheduler parameters */
+ long maxidle; /* Class parameters: see below. */
+ long offtime;
+ long minidle;
+ u32 avpkt;
+ struct qdisc_rate_table *R_tab;
+
+ /* General scheduler (WRR) parameters */
+ long allot;
+ long quantum; /* Allotment per WRR round */
+ long weight; /* Relative allotment: see below */
+
+ struct Qdisc *qdisc; /* Ptr to CBQ discipline */
+ struct cbq_class *split; /* Ptr to split node */
+ struct cbq_class *share; /* Ptr to LS parent in the class tree */
+ struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
+ struct cbq_class *borrow; /* NULL if class is bandwidth limited;
+ parent otherwise */
+ struct cbq_class *sibling; /* Sibling chain */
+ struct cbq_class *children; /* Pointer to children chain */
+
+ struct Qdisc *q; /* Elementary queueing discipline */
+
+
+/* Variables */
+ unsigned char cpriority; /* Effective priority */
+ unsigned char delayed;
+ unsigned char level; /* level of the class in hierarchy:
+ 0 for leaf classes, and maximal
+ level of children + 1 for nodes.
+ */
+
+ psched_time_t last; /* Last end of service */
+ psched_time_t undertime;
+ long avgidle;
+ long deficit; /* Saved deficit for WRR */
+ psched_time_t penalized;
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct net_rate_estimator __rcu *rate_est;
+ struct tc_cbq_xstats xstats;
+
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+
+ int filters;
+
+ struct cbq_class *defaults[TC_PRIO_MAX + 1];
+};
+
+struct cbq_sched_data {
+ struct Qdisc_class_hash clhash; /* Hash table of all classes */
+ int nclasses[TC_CBQ_MAXPRIO + 1];
+ unsigned int quanta[TC_CBQ_MAXPRIO + 1];
+
+ struct cbq_class link;
+
+ unsigned int activemask;
+ struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
+ with backlog */
+
+#ifdef CONFIG_NET_CLS_ACT
+ struct cbq_class *rx_class;
+#endif
+ struct cbq_class *tx_class;
+ struct cbq_class *tx_borrowed;
+ int tx_len;
+ psched_time_t now; /* Cached timestamp */
+ unsigned int pmask;
+
+ struct hrtimer delay_timer;
+ struct qdisc_watchdog watchdog; /* Watchdog timer,
+ started when CBQ has
+ backlog, but cannot
+ transmit just now */
+ psched_tdiff_t wd_expires;
+ int toplevel;
+ u32 hgenerator;
+};
+
+
+#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
+
+static inline struct cbq_class *
+cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
+{
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct cbq_class, common);
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+
+static struct cbq_class *
+cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
+{
+ struct cbq_class *cl;
+
+ for (cl = this->tparent; cl; cl = cl->tparent) {
+ struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
+
+ if (new != NULL && new != this)
+ return new;
+ }
+ return NULL;
+}
+
+#endif
+
+/* Classify packet. The procedure is pretty complicated, but
+ * it allows us to combine link sharing and priority scheduling
+ * transparently.
+ *
+ * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
+ * so that it resolves to split nodes. Then packets are classified
+ * by logical priority, or a more specific classifier may be attached
+ * to the split node.
+ */
+
+static struct cbq_class *
+cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *head = &q->link;
+ struct cbq_class **defmap;
+ struct cbq_class *cl = NULL;
+ u32 prio = skb->priority;
+ struct tcf_proto *fl;
+ struct tcf_result res;
+
+ /*
+ * Step 1. If skb->priority points to one of our classes, use it.
+ */
+ if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
+ (cl = cbq_class_lookup(q, prio)) != NULL)
+ return cl;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ for (;;) {
+ int result = 0;
+ defmap = head->defaults;
+
+ fl = rcu_dereference_bh(head->filter_list);
+ /*
+ * Step 2+n. Apply classifier.
+ */
+ result = tcf_classify(skb, fl, &res, true);
+ if (!fl || result < 0)
+ goto fallback;
+
+ cl = (void *)res.class;
+ if (!cl) {
+ if (TC_H_MAJ(res.classid))
+ cl = cbq_class_lookup(q, res.classid);
+ else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
+ cl = defmap[TC_PRIO_BESTEFFORT];
+
+ if (cl == NULL)
+ goto fallback;
+ }
+ if (cl->level >= head->level)
+ goto fallback;
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ case TC_ACT_RECLASSIFY:
+ return cbq_reclassify(skb, cl);
+ }
+#endif
+ if (cl->level == 0)
+ return cl;
+
+ /*
+ * Step 3+n. If classifier selected a link sharing class,
+ * apply agency specific classifier.
+ * Repeat this procdure until we hit a leaf node.
+ */
+ head = cl;
+ }
+
+fallback:
+ cl = head;
+
+ /*
+ * Step 4. No success...
+ */
+ if (TC_H_MAJ(prio) == 0 &&
+ !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
+ !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
+ return head;
+
+ return cl;
+}
+
+/*
+ * A packet has just been enqueued on the empty class.
+ * cbq_activate_class adds it to the tail of active class list
+ * of its priority band.
+ */
+
+static inline void cbq_activate_class(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ int prio = cl->cpriority;
+ struct cbq_class *cl_tail;
+
+ cl_tail = q->active[prio];
+ q->active[prio] = cl;
+
+ if (cl_tail != NULL) {
+ cl->next_alive = cl_tail->next_alive;
+ cl_tail->next_alive = cl;
+ } else {
+ cl->next_alive = cl;
+ q->activemask |= (1<<prio);
+ }
+}
+
+/*
+ * Unlink class from active chain.
+ * Note that this same procedure is done directly in cbq_dequeue*
+ * during round-robin procedure.
+ */
+
+static void cbq_deactivate_class(struct cbq_class *this)
+{
+ struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+ int prio = this->cpriority;
+ struct cbq_class *cl;
+ struct cbq_class *cl_prev = q->active[prio];
+
+ do {
+ cl = cl_prev->next_alive;
+ if (cl == this) {
+ cl_prev->next_alive = cl->next_alive;
+ cl->next_alive = NULL;
+
+ if (cl == q->active[prio]) {
+ q->active[prio] = cl_prev;
+ if (cl == q->active[prio]) {
+ q->active[prio] = NULL;
+ q->activemask &= ~(1<<prio);
+ return;
+ }
+ }
+ return;
+ }
+ } while ((cl_prev = cl) != q->active[prio]);
+}
+
+static void
+cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ int toplevel = q->toplevel;
+
+ if (toplevel > cl->level) {
+ psched_time_t now = psched_get_time();
+
+ do {
+ if (cl->undertime < now) {
+ q->toplevel = cl->level;
+ return;
+ }
+ } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
+ }
+}
+
+static int
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ int uninitialized_var(ret);
+ struct cbq_class *cl = cbq_classify(skb, sch, &ret);
+
+#ifdef CONFIG_NET_CLS_ACT
+ q->rx_class = cl;
+#endif
+ if (cl == NULL) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+
+ ret = qdisc_enqueue(skb, cl->q, to_free);
+ if (ret == NET_XMIT_SUCCESS) {
+ sch->q.qlen++;
+ cbq_mark_toplevel(q, cl);
+ if (!cl->next_alive)
+ cbq_activate_class(cl);
+ return ret;
+ }
+
+ if (net_xmit_drop_count(ret)) {
+ qdisc_qstats_drop(sch);
+ cbq_mark_toplevel(q, cl);
+ cl->qstats.drops++;
+ }
+ return ret;
+}
+
+/* Overlimit action: penalize leaf class by adding offtime */
+static void cbq_overlimit(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ psched_tdiff_t delay = cl->undertime - q->now;
+
+ if (!cl->delayed) {
+ delay += cl->offtime;
+
+ /*
+ * Class goes to sleep, so that it will have no
+ * chance to work avgidle. Let's forgive it 8)
+ *
+ * BTW cbq-2.0 has a crap in this
+ * place, apparently they forgot to shift it by cl->ewma_log.
+ */
+ if (cl->avgidle < 0)
+ delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
+ if (cl->avgidle < cl->minidle)
+ cl->avgidle = cl->minidle;
+ if (delay <= 0)
+ delay = 1;
+ cl->undertime = q->now + delay;
+
+ cl->xstats.overactions++;
+ cl->delayed = 1;
+ }
+ if (q->wd_expires == 0 || q->wd_expires > delay)
+ q->wd_expires = delay;
+
+ /* Dirty work! We must schedule wakeups based on
+ * real available rate, rather than leaf rate,
+ * which may be tiny (even zero).
+ */
+ if (q->toplevel == TC_CBQ_MAXLEVEL) {
+ struct cbq_class *b;
+ psched_tdiff_t base_delay = q->wd_expires;
+
+ for (b = cl->borrow; b; b = b->borrow) {
+ delay = b->undertime - q->now;
+ if (delay < base_delay) {
+ if (delay <= 0)
+ delay = 1;
+ base_delay = delay;
+ }
+ }
+
+ q->wd_expires = base_delay;
+ }
+}
+
+static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
+ psched_time_t now)
+{
+ struct cbq_class *cl;
+ struct cbq_class *cl_prev = q->active[prio];
+ psched_time_t sched = now;
+
+ if (cl_prev == NULL)
+ return 0;
+
+ do {
+ cl = cl_prev->next_alive;
+ if (now - cl->penalized > 0) {
+ cl_prev->next_alive = cl->next_alive;
+ cl->next_alive = NULL;
+ cl->cpriority = cl->priority;
+ cl->delayed = 0;
+ cbq_activate_class(cl);
+
+ if (cl == q->active[prio]) {
+ q->active[prio] = cl_prev;
+ if (cl == q->active[prio]) {
+ q->active[prio] = NULL;
+ return 0;
+ }
+ }
+
+ cl = cl_prev->next_alive;
+ } else if (sched - cl->penalized > 0)
+ sched = cl->penalized;
+ } while ((cl_prev = cl) != q->active[prio]);
+
+ return sched - now;
+}
+
+static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
+{
+ struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
+ delay_timer);
+ struct Qdisc *sch = q->watchdog.qdisc;
+ psched_time_t now;
+ psched_tdiff_t delay = 0;
+ unsigned int pmask;
+
+ now = psched_get_time();
+
+ pmask = q->pmask;
+ q->pmask = 0;
+
+ while (pmask) {
+ int prio = ffz(~pmask);
+ psched_tdiff_t tmp;
+
+ pmask &= ~(1<<prio);
+
+ tmp = cbq_undelay_prio(q, prio, now);
+ if (tmp > 0) {
+ q->pmask |= 1<<prio;
+ if (tmp < delay || delay == 0)
+ delay = tmp;
+ }
+ }
+
+ if (delay) {
+ ktime_t time;
+
+ time = 0;
+ time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
+ hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
+ }
+
+ __netif_schedule(qdisc_root(sch));
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * It is mission critical procedure.
+ *
+ * We "regenerate" toplevel cutoff, if transmitting class
+ * has backlog and it is not regulated. It is not part of
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
+
+static inline void
+cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
+ struct cbq_class *borrowed)
+{
+ if (cl && q->toplevel >= borrowed->level) {
+ if (cl->q->q.qlen > 1) {
+ do {
+ if (borrowed->undertime == PSCHED_PASTPERFECT) {
+ q->toplevel = borrowed->level;
+ return;
+ }
+ } while ((borrowed = borrowed->borrow) != NULL);
+ }
+#if 0
+ /* It is not necessary now. Uncommenting it
+ will save CPU cycles, but decrease fairness.
+ */
+ q->toplevel = TC_CBQ_MAXLEVEL;
+#endif
+ }
+}
+
+static void
+cbq_update(struct cbq_sched_data *q)
+{
+ struct cbq_class *this = q->tx_class;
+ struct cbq_class *cl = this;
+ int len = q->tx_len;
+ psched_time_t now;
+
+ q->tx_class = NULL;
+ /* Time integrator. We calculate EOS time
+ * by adding expected packet transmission time.
+ */
+ now = q->now + L2T(&q->link, len);
+
+ for ( ; cl; cl = cl->share) {
+ long avgidle = cl->avgidle;
+ long idle;
+
+ cl->bstats.packets++;
+ cl->bstats.bytes += len;
+
+ /*
+ * (now - last) is total time between packet right edges.
+ * (last_pktlen/rate) is "virtual" busy time, so that
+ *
+ * idle = (now - last) - last_pktlen/rate
+ */
+
+ idle = now - cl->last;
+ if ((unsigned long)idle > 128*1024*1024) {
+ avgidle = cl->maxidle;
+ } else {
+ idle -= L2T(cl, len);
+
+ /* true_avgidle := (1-W)*true_avgidle + W*idle,
+ * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+ * cl->avgidle == true_avgidle/W,
+ * hence:
+ */
+ avgidle += idle - (avgidle>>cl->ewma_log);
+ }
+
+ if (avgidle <= 0) {
+ /* Overlimit or at-limit */
+
+ if (avgidle < cl->minidle)
+ avgidle = cl->minidle;
+
+ cl->avgidle = avgidle;
+
+ /* Calculate expected time, when this class
+ * will be allowed to send.
+ * It will occur, when:
+ * (1-W)*true_avgidle + W*delay = 0, i.e.
+ * idle = (1/W - 1)*(-true_avgidle)
+ * or
+ * idle = (1 - W)*(-cl->avgidle);
+ */
+ idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
+
+ /*
+ * That is not all.
+ * To maintain the rate allocated to the class,
+ * we add to undertime virtual clock,
+ * necessary to complete transmitted packet.
+ * (len/phys_bandwidth has been already passed
+ * to the moment of cbq_update)
+ */
+
+ idle -= L2T(&q->link, len);
+ idle += L2T(cl, len);
+
+ cl->undertime = now + idle;
+ } else {
+ /* Underlimit */
+
+ cl->undertime = PSCHED_PASTPERFECT;
+ if (avgidle > cl->maxidle)
+ cl->avgidle = cl->maxidle;
+ else
+ cl->avgidle = avgidle;
+ }
+ if ((s64)(now - cl->last) > 0)
+ cl->last = now;
+ }
+
+ cbq_update_toplevel(q, this, q->tx_borrowed);
+}
+
+static inline struct cbq_class *
+cbq_under_limit(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ struct cbq_class *this_cl = cl;
+
+ if (cl->tparent == NULL)
+ return cl;
+
+ if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
+ cl->delayed = 0;
+ return cl;
+ }
+
+ do {
+ /* It is very suspicious place. Now overlimit
+ * action is generated for not bounded classes
+ * only if link is completely congested.
+ * Though it is in agree with ancestor-only paradigm,
+ * it looks very stupid. Particularly,
+ * it means that this chunk of code will either
+ * never be called or result in strong amplification
+ * of burstiness. Dangerous, silly, and, however,
+ * no another solution exists.
+ */
+ cl = cl->borrow;
+ if (!cl) {
+ this_cl->qstats.overlimits++;
+ cbq_overlimit(this_cl);
+ return NULL;
+ }
+ if (cl->level > q->toplevel)
+ return NULL;
+ } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
+
+ cl->delayed = 0;
+ return cl;
+}
+
+static inline struct sk_buff *
+cbq_dequeue_prio(struct Qdisc *sch, int prio)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl_tail, *cl_prev, *cl;
+ struct sk_buff *skb;
+ int deficit;
+
+ cl_tail = cl_prev = q->active[prio];
+ cl = cl_prev->next_alive;
+
+ do {
+ deficit = 0;
+
+ /* Start round */
+ do {
+ struct cbq_class *borrow = cl;
+
+ if (cl->q->q.qlen &&
+ (borrow = cbq_under_limit(cl)) == NULL)
+ goto skip_class;
+
+ if (cl->deficit <= 0) {
+ /* Class exhausted its allotment per
+ * this round. Switch to the next one.
+ */
+ deficit = 1;
+ cl->deficit += cl->quantum;
+ goto next_class;
+ }
+
+ skb = cl->q->dequeue(cl->q);
+
+ /* Class did not give us any skb :-(
+ * It could occur even if cl->q->q.qlen != 0
+ * f.e. if cl->q == "tbf"
+ */
+ if (skb == NULL)
+ goto skip_class;
+
+ cl->deficit -= qdisc_pkt_len(skb);
+ q->tx_class = cl;
+ q->tx_borrowed = borrow;
+ if (borrow != cl) {
+#ifndef CBQ_XSTATS_BORROWS_BYTES
+ borrow->xstats.borrows++;
+ cl->xstats.borrows++;
+#else
+ borrow->xstats.borrows += qdisc_pkt_len(skb);
+ cl->xstats.borrows += qdisc_pkt_len(skb);
+#endif
+ }
+ q->tx_len = qdisc_pkt_len(skb);
+
+ if (cl->deficit <= 0) {
+ q->active[prio] = cl;
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ }
+ return skb;
+
+skip_class:
+ if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
+ /* Class is empty or penalized.
+ * Unlink it from active chain.
+ */
+ cl_prev->next_alive = cl->next_alive;
+ cl->next_alive = NULL;
+
+ /* Did cl_tail point to it? */
+ if (cl == cl_tail) {
+ /* Repair it! */
+ cl_tail = cl_prev;
+
+ /* Was it the last class in this band? */
+ if (cl == cl_tail) {
+ /* Kill the band! */
+ q->active[prio] = NULL;
+ q->activemask &= ~(1<<prio);
+ if (cl->q->q.qlen)
+ cbq_activate_class(cl);
+ return NULL;
+ }
+
+ q->active[prio] = cl_tail;
+ }
+ if (cl->q->q.qlen)
+ cbq_activate_class(cl);
+
+ cl = cl_prev;
+ }
+
+next_class:
+ cl_prev = cl;
+ cl = cl->next_alive;
+ } while (cl_prev != cl_tail);
+ } while (deficit);
+
+ q->active[prio] = cl_prev;
+
+ return NULL;
+}
+
+static inline struct sk_buff *
+cbq_dequeue_1(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ unsigned int activemask;
+
+ activemask = q->activemask & 0xFF;
+ while (activemask) {
+ int prio = ffz(~activemask);
+ activemask &= ~(1<<prio);
+ skb = cbq_dequeue_prio(sch, prio);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static struct sk_buff *
+cbq_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ psched_time_t now;
+
+ now = psched_get_time();
+
+ if (q->tx_class)
+ cbq_update(q);
+
+ q->now = now;
+
+ for (;;) {
+ q->wd_expires = 0;
+
+ skb = cbq_dequeue_1(sch);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+
+ /* All the classes are overlimit.
+ *
+ * It is possible, if:
+ *
+ * 1. Scheduler is empty.
+ * 2. Toplevel cutoff inhibited borrowing.
+ * 3. Root class is overlimit.
+ *
+ * Reset 2d and 3d conditions and retry.
+ *
+ * Note, that NS and cbq-2.0 are buggy, peeking
+ * an arbitrary class is appropriate for ancestor-only
+ * sharing, but not for toplevel algorithm.
+ *
+ * Our version is better, but slower, because it requires
+ * two passes, but it is unavoidable with top-level sharing.
+ */
+
+ if (q->toplevel == TC_CBQ_MAXLEVEL &&
+ q->link.undertime == PSCHED_PASTPERFECT)
+ break;
+
+ q->toplevel = TC_CBQ_MAXLEVEL;
+ q->link.undertime = PSCHED_PASTPERFECT;
+ }
+
+ /* No packets in scheduler or nobody wants to give them to us :-(
+ * Sigh... start watchdog timer in the last case.
+ */
+
+ if (sch->q.qlen) {
+ qdisc_qstats_overlimit(sch);
+ if (q->wd_expires)
+ qdisc_watchdog_schedule(&q->watchdog,
+ now + q->wd_expires);
+ }
+ return NULL;
+}
+
+/* CBQ class maintanance routines */
+
+static void cbq_adjust_levels(struct cbq_class *this)
+{
+ if (this == NULL)
+ return;
+
+ do {
+ int level = 0;
+ struct cbq_class *cl;
+
+ cl = this->children;
+ if (cl) {
+ do {
+ if (cl->level > level)
+ level = cl->level;
+ } while ((cl = cl->sibling) != this->children);
+ }
+ this->level = level + 1;
+ } while ((this = this->tparent) != NULL);
+}
+
+static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
+{
+ struct cbq_class *cl;
+ unsigned int h;
+
+ if (q->quanta[prio] == 0)
+ return;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ /* BUGGGG... Beware! This expression suffer of
+ * arithmetic overflows!
+ */
+ if (cl->priority == prio) {
+ cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
+ q->quanta[prio];
+ }
+ if (cl->quantum <= 0 ||
+ cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
+ pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
+ cl->common.classid, cl->quantum);
+ cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
+ }
+ }
+ }
+}
+
+static void cbq_sync_defmap(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ struct cbq_class *split = cl->split;
+ unsigned int h;
+ int i;
+
+ if (split == NULL)
+ return;
+
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
+ split->defaults[i] = NULL;
+ }
+
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ int level = split->level;
+
+ if (split->defaults[i])
+ continue;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ struct cbq_class *c;
+
+ hlist_for_each_entry(c, &q->clhash.hash[h],
+ common.hnode) {
+ if (c->split == split && c->level < level &&
+ c->defmap & (1<<i)) {
+ split->defaults[i] = c;
+ level = c->level;
+ }
+ }
+ }
+ }
+}
+
+static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
+{
+ struct cbq_class *split = NULL;
+
+ if (splitid == 0) {
+ split = cl->split;
+ if (!split)
+ return;
+ splitid = split->common.classid;
+ }
+
+ if (split == NULL || split->common.classid != splitid) {
+ for (split = cl->tparent; split; split = split->tparent)
+ if (split->common.classid == splitid)
+ break;
+ }
+
+ if (split == NULL)
+ return;
+
+ if (cl->split != split) {
+ cl->defmap = 0;
+ cbq_sync_defmap(cl);
+ cl->split = split;
+ cl->defmap = def & mask;
+ } else
+ cl->defmap = (cl->defmap & ~mask) | (def & mask);
+
+ cbq_sync_defmap(cl);
+}
+
+static void cbq_unlink_class(struct cbq_class *this)
+{
+ struct cbq_class *cl, **clp;
+ struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+
+ qdisc_class_hash_remove(&q->clhash, &this->common);
+
+ if (this->tparent) {
+ clp = &this->sibling;
+ cl = *clp;
+ do {
+ if (cl == this) {
+ *clp = cl->sibling;
+ break;
+ }
+ clp = &cl->sibling;
+ } while ((cl = *clp) != this->sibling);
+
+ if (this->tparent->children == this) {
+ this->tparent->children = this->sibling;
+ if (this->sibling == this)
+ this->tparent->children = NULL;
+ }
+ } else {
+ WARN_ON(this->sibling != this);
+ }
+}
+
+static void cbq_link_class(struct cbq_class *this)
+{
+ struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+ struct cbq_class *parent = this->tparent;
+
+ this->sibling = this;
+ qdisc_class_hash_insert(&q->clhash, &this->common);
+
+ if (parent == NULL)
+ return;
+
+ if (parent->children == NULL) {
+ parent->children = this;
+ } else {
+ this->sibling = parent->children->sibling;
+ parent->children->sibling = this;
+ }
+}
+
+static void
+cbq_reset(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl;
+ int prio;
+ unsigned int h;
+
+ q->activemask = 0;
+ q->pmask = 0;
+ q->tx_class = NULL;
+ q->tx_borrowed = NULL;
+ qdisc_watchdog_cancel(&q->watchdog);
+ hrtimer_cancel(&q->delay_timer);
+ q->toplevel = TC_CBQ_MAXLEVEL;
+ q->now = psched_get_time();
+
+ for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
+ q->active[prio] = NULL;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ qdisc_reset(cl->q);
+
+ cl->next_alive = NULL;
+ cl->undertime = PSCHED_PASTPERFECT;
+ cl->avgidle = cl->maxidle;
+ cl->deficit = cl->quantum;
+ cl->cpriority = cl->priority;
+ }
+ }
+ sch->q.qlen = 0;
+}
+
+
+static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
+{
+ if (lss->change & TCF_CBQ_LSS_FLAGS) {
+ cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
+ cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
+ }
+ if (lss->change & TCF_CBQ_LSS_EWMA)
+ cl->ewma_log = lss->ewma_log;
+ if (lss->change & TCF_CBQ_LSS_AVPKT)
+ cl->avpkt = lss->avpkt;
+ if (lss->change & TCF_CBQ_LSS_MINIDLE)
+ cl->minidle = -(long)lss->minidle;
+ if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
+ cl->maxidle = lss->maxidle;
+ cl->avgidle = lss->maxidle;
+ }
+ if (lss->change & TCF_CBQ_LSS_OFFTIME)
+ cl->offtime = lss->offtime;
+ return 0;
+}
+
+static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ q->nclasses[cl->priority]--;
+ q->quanta[cl->priority] -= cl->weight;
+ cbq_normalize_quanta(q, cl->priority);
+}
+
+static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ q->nclasses[cl->priority]++;
+ q->quanta[cl->priority] += cl->weight;
+ cbq_normalize_quanta(q, cl->priority);
+}
+
+static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+
+ if (wrr->allot)
+ cl->allot = wrr->allot;
+ if (wrr->weight)
+ cl->weight = wrr->weight;
+ if (wrr->priority) {
+ cl->priority = wrr->priority - 1;
+ cl->cpriority = cl->priority;
+ if (cl->priority >= cl->priority2)
+ cl->priority2 = TC_CBQ_MAXPRIO - 1;
+ }
+
+ cbq_addprio(q, cl);
+ return 0;
+}
+
+static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
+{
+ cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
+ return 0;
+}
+
+static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
+ [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
+ [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
+ [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
+ [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
+ [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
+ [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
+};
+
+static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
+ struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CBQ_WRROPT]) {
+ const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
+
+ if (wrr->priority > TC_CBQ_MAXPRIO) {
+ NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
+ err = -EINVAL;
+ }
+ }
+ return err;
+}
+
+static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CBQ_MAX + 1];
+ struct tc_ratespec *r;
+ int err;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+ hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ q->delay_timer.function = cbq_undelay;
+
+ err = cbq_opt_parse(tb, opt, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
+ NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
+ return -EINVAL;
+ }
+
+ r = nla_data(tb[TCA_CBQ_RATE]);
+
+ q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
+ if (!q->link.R_tab)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
+ if (err)
+ goto put_rtab;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ goto put_block;
+
+ q->link.sibling = &q->link;
+ q->link.common.classid = sch->handle;
+ q->link.qdisc = sch;
+ q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, NULL);
+ if (!q->link.q)
+ q->link.q = &noop_qdisc;
+ else
+ qdisc_hash_add(q->link.q, true);
+
+ q->link.priority = TC_CBQ_MAXPRIO - 1;
+ q->link.priority2 = TC_CBQ_MAXPRIO - 1;
+ q->link.cpriority = TC_CBQ_MAXPRIO - 1;
+ q->link.allot = psched_mtu(qdisc_dev(sch));
+ q->link.quantum = q->link.allot;
+ q->link.weight = q->link.R_tab->rate.rate;
+
+ q->link.ewma_log = TC_CBQ_DEF_EWMA;
+ q->link.avpkt = q->link.allot/2;
+ q->link.minidle = -0x7FFFFFFF;
+
+ q->toplevel = TC_CBQ_MAXLEVEL;
+ q->now = psched_get_time();
+
+ cbq_link_class(&q->link);
+
+ if (tb[TCA_CBQ_LSSOPT])
+ cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
+
+ cbq_addprio(q, &q->link);
+ return 0;
+
+put_block:
+ tcf_block_put(q->link.block);
+
+put_rtab:
+ qdisc_put_rtab(q->link.R_tab);
+ return err;
+}
+
+static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+
+ if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_lssopt opt;
+
+ opt.flags = 0;
+ if (cl->borrow == NULL)
+ opt.flags |= TCF_CBQ_LSS_BOUNDED;
+ if (cl->share == NULL)
+ opt.flags |= TCF_CBQ_LSS_ISOLATED;
+ opt.ewma_log = cl->ewma_log;
+ opt.level = cl->level;
+ opt.avpkt = cl->avpkt;
+ opt.maxidle = cl->maxidle;
+ opt.minidle = (u32)(-cl->minidle);
+ opt.offtime = cl->offtime;
+ opt.change = ~0;
+ if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_wrropt opt;
+
+ memset(&opt, 0, sizeof(opt));
+ opt.flags = 0;
+ opt.allot = cl->allot;
+ opt.priority = cl->priority + 1;
+ opt.cpriority = cl->cpriority + 1;
+ opt.weight = cl->weight;
+ if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_fopt opt;
+
+ if (cl->split || cl->defmap) {
+ opt.split = cl->split ? cl->split->common.classid : 0;
+ opt.defmap = cl->defmap;
+ opt.defchange = ~0;
+ if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
+ goto nla_put_failure;
+ }
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
+{
+ if (cbq_dump_lss(skb, cl) < 0 ||
+ cbq_dump_rate(skb, cl) < 0 ||
+ cbq_dump_wrr(skb, cl) < 0 ||
+ cbq_dump_fopt(skb, cl) < 0)
+ return -1;
+ return 0;
+}
+
+static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (cbq_dump_attr(skb, &q->link) < 0)
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int
+cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+
+ q->link.xstats.avgidle = q->link.avgidle;
+ return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
+}
+
+static int
+cbq_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+ struct nlattr *nest;
+
+ if (cl->tparent)
+ tcm->tcm_parent = cl->tparent->common.classid;
+ else
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ tcm->tcm_info = cl->q->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (cbq_dump_attr(skb, cl) < 0)
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int
+cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ cl->xstats.avgidle = cl->avgidle;
+ cl->xstats.undertime = 0;
+
+ if (cl->undertime != PSCHED_PASTPERFECT)
+ cl->xstats.undertime = cl->undertime - q->now;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
+}
+
+static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->common.classid, extack);
+ if (new == NULL)
+ return -ENOBUFS;
+ }
+
+ *old = qdisc_replace(sch, new, &cl->q);
+ return 0;
+}
+
+static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ return cl->q;
+}
+
+static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ cbq_deactivate_class(cl);
+}
+
+static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+
+ return (unsigned long)cbq_class_lookup(q, classid);
+}
+
+static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+
+ WARN_ON(cl->filters);
+
+ tcf_block_put(cl->block);
+ qdisc_put(cl->q);
+ qdisc_put_rtab(cl->R_tab);
+ gen_kill_estimator(&cl->rate_est);
+ if (cl != &q->link)
+ kfree(cl);
+}
+
+static void cbq_destroy(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct hlist_node *next;
+ struct cbq_class *cl;
+ unsigned int h;
+
+#ifdef CONFIG_NET_CLS_ACT
+ q->rx_class = NULL;
+#endif
+ /*
+ * Filters must be destroyed first because we don't destroy the
+ * classes from root to leafs which means that filters can still
+ * be bound to classes which have been destroyed already. --TGR '04
+ */
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ tcf_block_put(cl->block);
+ cl->block = NULL;
+ }
+ }
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
+ common.hnode)
+ cbq_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+}
+
+static int
+cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
+ unsigned long *arg, struct netlink_ext_ack *extack)
+{
+ int err;
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)*arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_CBQ_MAX + 1];
+ struct cbq_class *parent;
+ struct qdisc_rate_table *rtab = NULL;
+
+ err = cbq_opt_parse(tb, opt, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
+ NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
+ return -EOPNOTSUPP;
+ }
+
+ if (cl) {
+ /* Check parent */
+ if (parentid) {
+ if (cl->tparent &&
+ cl->tparent->common.classid != parentid) {
+ NL_SET_ERR_MSG(extack, "Invalid parent id");
+ return -EINVAL;
+ }
+ if (!cl->tparent && parentid != TC_H_ROOT) {
+ NL_SET_ERR_MSG(extack, "Parent must be root");
+ return -EINVAL;
+ }
+ }
+
+ if (tb[TCA_CBQ_RATE]) {
+ rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
+ tb[TCA_CBQ_RTAB], extack);
+ if (rtab == NULL)
+ return -EINVAL;
+ }
+
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
+ qdisc_put_rtab(rtab);
+ return err;
+ }
+ }
+
+ /* Change class parameters */
+ sch_tree_lock(sch);
+
+ if (cl->next_alive != NULL)
+ cbq_deactivate_class(cl);
+
+ if (rtab) {
+ qdisc_put_rtab(cl->R_tab);
+ cl->R_tab = rtab;
+ }
+
+ if (tb[TCA_CBQ_LSSOPT])
+ cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
+
+ if (tb[TCA_CBQ_WRROPT]) {
+ cbq_rmprio(q, cl);
+ cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
+ }
+
+ if (tb[TCA_CBQ_FOPT])
+ cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
+
+ if (cl->q->q.qlen)
+ cbq_activate_class(cl);
+
+ sch_tree_unlock(sch);
+
+ return 0;
+ }
+
+ if (parentid == TC_H_ROOT)
+ return -EINVAL;
+
+ if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
+ NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
+ return -EINVAL;
+ }
+
+ rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
+ extack);
+ if (rtab == NULL)
+ return -EINVAL;
+
+ if (classid) {
+ err = -EINVAL;
+ if (TC_H_MAJ(classid ^ sch->handle) ||
+ cbq_class_lookup(q, classid)) {
+ NL_SET_ERR_MSG(extack, "Specified class not found");
+ goto failure;
+ }
+ } else {
+ int i;
+ classid = TC_H_MAKE(sch->handle, 0x8000);
+
+ for (i = 0; i < 0x8000; i++) {
+ if (++q->hgenerator >= 0x8000)
+ q->hgenerator = 1;
+ if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
+ break;
+ }
+ err = -ENOSR;
+ if (i >= 0x8000) {
+ NL_SET_ERR_MSG(extack, "Unable to generate classid");
+ goto failure;
+ }
+ classid = classid|q->hgenerator;
+ }
+
+ parent = &q->link;
+ if (parentid) {
+ parent = cbq_class_lookup(q, parentid);
+ err = -EINVAL;
+ if (!parent) {
+ NL_SET_ERR_MSG(extack, "Failed to find parentid");
+ goto failure;
+ }
+ }
+
+ err = -ENOBUFS;
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (cl == NULL)
+ goto failure;
+
+ err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
+ if (err) {
+ kfree(cl);
+ goto failure;
+ }
+
+ if (tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
+ tcf_block_put(cl->block);
+ kfree(cl);
+ goto failure;
+ }
+ }
+
+ cl->R_tab = rtab;
+ rtab = NULL;
+ cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
+ NULL);
+ if (!cl->q)
+ cl->q = &noop_qdisc;
+ else
+ qdisc_hash_add(cl->q, true);
+
+ cl->common.classid = classid;
+ cl->tparent = parent;
+ cl->qdisc = sch;
+ cl->allot = parent->allot;
+ cl->quantum = cl->allot;
+ cl->weight = cl->R_tab->rate.rate;
+
+ sch_tree_lock(sch);
+ cbq_link_class(cl);
+ cl->borrow = cl->tparent;
+ if (cl->tparent != &q->link)
+ cl->share = cl->tparent;
+ cbq_adjust_levels(parent);
+ cl->minidle = -0x7FFFFFFF;
+ cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
+ cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
+ if (cl->ewma_log == 0)
+ cl->ewma_log = q->link.ewma_log;
+ if (cl->maxidle == 0)
+ cl->maxidle = q->link.maxidle;
+ if (cl->avpkt == 0)
+ cl->avpkt = q->link.avpkt;
+ if (tb[TCA_CBQ_FOPT])
+ cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+
+failure:
+ qdisc_put_rtab(rtab);
+ return err;
+}
+
+static int cbq_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)arg;
+ unsigned int qlen, backlog;
+
+ if (cl->filters || cl->children || cl == &q->link)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ qlen = cl->q->q.qlen;
+ backlog = cl->q->qstats.backlog;
+ qdisc_reset(cl->q);
+ qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
+
+ if (cl->next_alive)
+ cbq_deactivate_class(cl);
+
+ if (q->tx_borrowed == cl)
+ q->tx_borrowed = q->tx_class;
+ if (q->tx_class == cl) {
+ q->tx_class = NULL;
+ q->tx_borrowed = NULL;
+ }
+#ifdef CONFIG_NET_CLS_ACT
+ if (q->rx_class == cl)
+ q->rx_class = NULL;
+#endif
+
+ cbq_unlink_class(cl);
+ cbq_adjust_levels(cl->tparent);
+ cl->defmap = 0;
+ cbq_sync_defmap(cl);
+
+ cbq_rmprio(q, cl);
+ sch_tree_unlock(sch);
+
+ cbq_destroy_class(sch, cl);
+ return 0;
+}
+
+static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
+ struct netlink_ext_ack *extack)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ if (cl == NULL)
+ cl = &q->link;
+
+ return cl->block;
+}
+
+static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *p = (struct cbq_class *)parent;
+ struct cbq_class *cl = cbq_class_lookup(q, classid);
+
+ if (cl) {
+ if (p && p->level <= cl->level)
+ return 0;
+ cl->filters++;
+ return (unsigned long)cl;
+ }
+ return 0;
+}
+
+static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ cl->filters--;
+}
+
+static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl;
+ unsigned int h;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static const struct Qdisc_class_ops cbq_class_ops = {
+ .graft = cbq_graft,
+ .leaf = cbq_leaf,
+ .qlen_notify = cbq_qlen_notify,
+ .find = cbq_find,
+ .change = cbq_change_class,
+ .delete = cbq_delete,
+ .walk = cbq_walk,
+ .tcf_block = cbq_tcf_block,
+ .bind_tcf = cbq_bind_filter,
+ .unbind_tcf = cbq_unbind_filter,
+ .dump = cbq_dump_class,
+ .dump_stats = cbq_dump_class_stats,
+};
+
+static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &cbq_class_ops,
+ .id = "cbq",
+ .priv_size = sizeof(struct cbq_sched_data),
+ .enqueue = cbq_enqueue,
+ .dequeue = cbq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = cbq_init,
+ .reset = cbq_reset,
+ .destroy = cbq_destroy,
+ .change = NULL,
+ .dump = cbq_dump,
+ .dump_stats = cbq_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init cbq_module_init(void)
+{
+ return register_qdisc(&cbq_qdisc_ops);
+}
+static void __exit cbq_module_exit(void)
+{
+ unregister_qdisc(&cbq_qdisc_ops);
+}
+module_init(cbq_module_init)
+module_exit(cbq_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
new file mode 100644
index 000000000..f95dc899e
--- /dev/null
+++ b/net/sched/sch_cbs.c
@@ -0,0 +1,588 @@
+/*
+ * net/sched/sch_cbs.c Credit Based Shaper
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+/* Credit Based Shaper (CBS)
+ * =========================
+ *
+ * This is a simple rate-limiting shaper aimed at TSN applications on
+ * systems with known traffic workloads.
+ *
+ * Its algorithm is defined by the IEEE 802.1Q-2014 Specification,
+ * Section 8.6.8.2, and explained in more detail in the Annex L of the
+ * same specification.
+ *
+ * There are four tunables to be considered:
+ *
+ * 'idleslope': Idleslope is the rate of credits that is
+ * accumulated (in kilobits per second) when there is at least
+ * one packet waiting for transmission. Packets are transmitted
+ * when the current value of credits is equal or greater than
+ * zero. When there is no packet to be transmitted the amount of
+ * credits is set to zero. This is the main tunable of the CBS
+ * algorithm.
+ *
+ * 'sendslope':
+ * Sendslope is the rate of credits that is depleted (it should be a
+ * negative number of kilobits per second) when a transmission is
+ * ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
+ * 8.6.8.2 item g):
+ *
+ * sendslope = idleslope - port_transmit_rate
+ *
+ * 'hicredit': Hicredit defines the maximum amount of credits (in
+ * bytes) that can be accumulated. Hicredit depends on the
+ * characteristics of interfering traffic,
+ * 'max_interference_size' is the maximum size of any burst of
+ * traffic that can delay the transmission of a frame that is
+ * available for transmission for this traffic class, (IEEE
+ * 802.1Q-2014 Annex L, Equation L-3):
+ *
+ * hicredit = max_interference_size * (idleslope / port_transmit_rate)
+ *
+ * 'locredit': Locredit is the minimum amount of credits that can
+ * be reached. It is a function of the traffic flowing through
+ * this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2):
+ *
+ * locredit = max_frame_size * (sendslope / port_transmit_rate)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+static LIST_HEAD(cbs_list);
+static DEFINE_SPINLOCK(cbs_list_lock);
+
+#define BYTES_PER_KBIT (1000LL / 8)
+
+struct cbs_sched_data {
+ bool offload;
+ int queue;
+ atomic64_t port_rate; /* in bytes/s */
+ s64 last; /* timestamp in ns */
+ s64 credits; /* in bytes */
+ s32 locredit; /* in bytes */
+ s32 hicredit; /* in bytes */
+ s64 sendslope; /* in bytes/s */
+ s64 idleslope; /* in bytes/s */
+ struct qdisc_watchdog watchdog;
+ int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free);
+ struct sk_buff *(*dequeue)(struct Qdisc *sch);
+ struct Qdisc *qdisc;
+ struct list_head cbs_list;
+};
+
+static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child,
+ struct sk_buff **to_free)
+{
+ int err;
+
+ err = child->ops->enqueue(skb, child, to_free);
+ if (err != NET_XMIT_SUCCESS)
+ return err;
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+
+static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+
+ return cbs_child_enqueue(skb, sch, qdisc, to_free);
+}
+
+static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+
+ if (sch->q.qlen == 0 && q->credits > 0) {
+ /* We need to stop accumulating credits when there's
+ * no enqueued packets and q->credits is positive.
+ */
+ q->credits = 0;
+ q->last = ktime_get_ns();
+ }
+
+ return cbs_child_enqueue(skb, sch, qdisc, to_free);
+}
+
+static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ return q->enqueue(skb, sch, to_free);
+}
+
+/* timediff is in ns, slope is in bytes/s */
+static s64 timediff_to_credits(s64 timediff, s64 slope)
+{
+ return div64_s64(timediff * slope, NSEC_PER_SEC);
+}
+
+static s64 delay_from_credits(s64 credits, s64 slope)
+{
+ if (unlikely(slope == 0))
+ return S64_MAX;
+
+ return div64_s64(-credits * NSEC_PER_SEC, slope);
+}
+
+static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate)
+{
+ if (unlikely(port_rate == 0))
+ return S64_MAX;
+
+ return div64_s64(len * slope, port_rate);
+}
+
+static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child)
+{
+ struct sk_buff *skb;
+
+ skb = child->ops->dequeue(child);
+ if (!skb)
+ return NULL;
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+ s64 now = ktime_get_ns();
+ struct sk_buff *skb;
+ s64 credits;
+ int len;
+
+ /* The previous packet is still being sent */
+ if (now < q->last) {
+ qdisc_watchdog_schedule_ns(&q->watchdog, q->last);
+ return NULL;
+ }
+ if (q->credits < 0) {
+ credits = timediff_to_credits(now - q->last, q->idleslope);
+
+ credits = q->credits + credits;
+ q->credits = min_t(s64, credits, q->hicredit);
+
+ if (q->credits < 0) {
+ s64 delay;
+
+ delay = delay_from_credits(q->credits, q->idleslope);
+ qdisc_watchdog_schedule_ns(&q->watchdog, now + delay);
+
+ q->last = now;
+
+ return NULL;
+ }
+ }
+ skb = cbs_child_dequeue(sch, qdisc);
+ if (!skb)
+ return NULL;
+
+ len = qdisc_pkt_len(skb);
+
+ /* As sendslope is a negative number, this will decrease the
+ * amount of q->credits.
+ */
+ credits = credits_from_len(len, q->sendslope,
+ atomic64_read(&q->port_rate));
+ credits += q->credits;
+
+ q->credits = max_t(s64, credits, q->locredit);
+ /* Estimate of the transmission of the last byte of the packet in ns */
+ if (unlikely(atomic64_read(&q->port_rate) == 0))
+ q->last = now;
+ else
+ q->last = now + div64_s64(len * NSEC_PER_SEC,
+ atomic64_read(&q->port_rate));
+
+ return skb;
+}
+
+static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+
+ return cbs_child_dequeue(sch, qdisc);
+}
+
+static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ return q->dequeue(sch);
+}
+
+static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
+ [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) },
+};
+
+static void cbs_disable_offload(struct net_device *dev,
+ struct cbs_sched_data *q)
+{
+ struct tc_cbs_qopt_offload cbs = { };
+ const struct net_device_ops *ops;
+ int err;
+
+ if (!q->offload)
+ return;
+
+ q->enqueue = cbs_enqueue_soft;
+ q->dequeue = cbs_dequeue_soft;
+
+ ops = dev->netdev_ops;
+ if (!ops->ndo_setup_tc)
+ return;
+
+ cbs.queue = q->queue;
+ cbs.enable = 0;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs);
+ if (err < 0)
+ pr_warn("Couldn't disable CBS offload for queue %d\n",
+ cbs.queue);
+}
+
+static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
+ const struct tc_cbs_qopt *opt,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_cbs_qopt_offload cbs = { };
+ int err;
+
+ if (!ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack, "Specified device does not support cbs offload");
+ return -EOPNOTSUPP;
+ }
+
+ cbs.queue = q->queue;
+
+ cbs.enable = 1;
+ cbs.hicredit = opt->hicredit;
+ cbs.locredit = opt->locredit;
+ cbs.idleslope = opt->idleslope;
+ cbs.sendslope = opt->sendslope;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Specified device failed to setup cbs hardware offload");
+ return err;
+ }
+
+ q->enqueue = cbs_enqueue_offload;
+ q->dequeue = cbs_dequeue_offload;
+
+ return 0;
+}
+
+static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q)
+{
+ struct ethtool_link_ksettings ecmd;
+ int speed = SPEED_10;
+ int port_rate = -1;
+ int err;
+
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
+
+skip:
+ port_rate = speed * 1000 * BYTES_PER_KBIT;
+
+ atomic64_set(&q->port_rate, port_rate);
+ netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n",
+ dev->name, (long long)atomic64_read(&q->port_rate),
+ ecmd.base.speed);
+}
+
+static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct cbs_sched_data *q;
+ struct net_device *qdev;
+ bool found = false;
+
+ ASSERT_RTNL();
+
+ if (event != NETDEV_UP && event != NETDEV_CHANGE)
+ return NOTIFY_DONE;
+
+ spin_lock(&cbs_list_lock);
+ list_for_each_entry(q, &cbs_list, cbs_list) {
+ qdev = qdisc_dev(q->qdisc);
+ if (qdev == dev) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&cbs_list_lock);
+
+ if (found)
+ cbs_set_port_rate(dev, q);
+
+ return NOTIFY_DONE;
+}
+
+static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct nlattr *tb[TCA_CBS_MAX + 1];
+ struct tc_cbs_qopt *qopt;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CBS_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing CBS parameter which are mandatory");
+ return -EINVAL;
+ }
+
+ qopt = nla_data(tb[TCA_CBS_PARMS]);
+
+ if (!qopt->offload) {
+ cbs_set_port_rate(dev, q);
+ cbs_disable_offload(dev, q);
+ } else {
+ err = cbs_enable_offload(dev, q, qopt, extack);
+ if (err < 0)
+ return err;
+ }
+
+ /* Everything went OK, save the parameters used. */
+ q->hicredit = qopt->hicredit;
+ q->locredit = qopt->locredit;
+ q->idleslope = qopt->idleslope * BYTES_PER_KBIT;
+ q->sendslope = qopt->sendslope * BYTES_PER_KBIT;
+ q->offload = qopt->offload;
+
+ return 0;
+}
+
+static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory");
+ return -EINVAL;
+ }
+
+ q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, extack);
+ if (!q->qdisc)
+ return -ENOMEM;
+
+ qdisc_hash_add(q->qdisc, false);
+
+ q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+ q->enqueue = cbs_enqueue_soft;
+ q->dequeue = cbs_dequeue_soft;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ err = cbs_change(sch, opt, extack);
+ if (err)
+ return err;
+
+ if (!q->offload) {
+ spin_lock(&cbs_list_lock);
+ list_add(&q->cbs_list, &cbs_list);
+ spin_unlock(&cbs_list_lock);
+ }
+
+ return 0;
+}
+
+static void cbs_destroy(struct Qdisc *sch)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ spin_lock(&cbs_list_lock);
+ list_del(&q->cbs_list);
+ spin_unlock(&cbs_list_lock);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ cbs_disable_offload(dev, q);
+
+ if (q->qdisc)
+ qdisc_put(q->qdisc);
+}
+
+static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct tc_cbs_qopt opt = { };
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ opt.hicredit = q->hicredit;
+ opt.locredit = q->locredit;
+ opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT);
+ opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT);
+ opt.offload = q->offload;
+
+ if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int cbs_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1 || !q->qdisc) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, NULL);
+ if (!new)
+ new = &noop_qdisc;
+ }
+
+ *old = qdisc_replace(sch, new, &q->qdisc);
+ return 0;
+}
+
+static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+
+static unsigned long cbs_find(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip) {
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops cbs_class_ops = {
+ .graft = cbs_graft,
+ .leaf = cbs_leaf,
+ .find = cbs_find,
+ .walk = cbs_walk,
+ .dump = cbs_dump_class,
+};
+
+static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
+ .id = "cbs",
+ .cl_ops = &cbs_class_ops,
+ .priv_size = sizeof(struct cbs_sched_data),
+ .enqueue = cbs_enqueue,
+ .dequeue = cbs_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = cbs_init,
+ .reset = qdisc_reset_queue,
+ .destroy = cbs_destroy,
+ .change = cbs_change,
+ .dump = cbs_dump,
+ .owner = THIS_MODULE,
+};
+
+static struct notifier_block cbs_device_notifier = {
+ .notifier_call = cbs_dev_notifier,
+};
+
+static int __init cbs_module_init(void)
+{
+ int err;
+
+ err = register_netdevice_notifier(&cbs_device_notifier);
+ if (err)
+ return err;
+
+ err = register_qdisc(&cbs_qdisc_ops);
+ if (err)
+ unregister_netdevice_notifier(&cbs_device_notifier);
+
+ return err;
+}
+
+static void __exit cbs_module_exit(void)
+{
+ unregister_qdisc(&cbs_qdisc_ops);
+ unregister_netdevice_notifier(&cbs_device_notifier);
+}
+module_init(cbs_module_init)
+module_exit(cbs_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
new file mode 100644
index 000000000..8b72aed77
--- /dev/null
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,528 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <net/flow_dissector.h>
+
+/*
+ CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+ needs to access packets in queue randomly. It has a minimal class
+ interface to allow overriding the builtin flow classifier with
+ filters.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE (128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+ u32 limit;
+ unsigned char flags;
+
+ struct red_parms parms;
+
+/* Variables */
+ struct red_vars vars;
+ struct {
+ u32 prob_drop; /* Early probability drops */
+ u32 prob_mark; /* Early probability marks */
+ u32 forced_drop; /* Forced drops, qavg > max_thresh */
+ u32 forced_mark; /* Forced marks, qavg > max_thresh */
+ u32 pdrop; /* Drops due to queue limits */
+ u32 other; /* Drops due to drop() calls */
+ u32 matched; /* Drops to flow match */
+ } stats;
+
+ unsigned int head;
+ unsigned int tail;
+
+ unsigned int tab_mask; /* size - 1 */
+
+ struct sk_buff **tab;
+};
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+ return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+ do {
+ q->head = (q->head + 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+ do {
+ q->tail = (q->tail - 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
+ struct sk_buff **to_free)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = q->tab[idx];
+
+ q->tab[idx] = NULL;
+
+ if (idx == q->head)
+ choke_zap_head_holes(q);
+ if (idx == q->tail)
+ choke_zap_tail_holes(q);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
+ qdisc_drop(skb, sch, to_free);
+ --sch->q.qlen;
+}
+
+struct choke_skb_cb {
+ u16 classid;
+ u8 keys_valid;
+ struct flow_keys_digest keys;
+};
+
+static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
+ return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+ choke_skb_cb(skb)->classid = classid;
+}
+
+/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * false for special cases
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+ struct sk_buff *skb2)
+{
+ struct flow_keys temp;
+
+ if (skb1->protocol != skb2->protocol)
+ return false;
+
+ if (!choke_skb_cb(skb1)->keys_valid) {
+ choke_skb_cb(skb1)->keys_valid = 1;
+ skb_flow_dissect_flow_keys(skb1, &temp, 0);
+ make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
+ }
+
+ if (!choke_skb_cb(skb2)->keys_valid) {
+ choke_skb_cb(skb2)->keys_valid = 1;
+ skb_flow_dissect_flow_keys(skb2, &temp, 0);
+ make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
+ }
+
+ return !memcmp(&choke_skb_cb(skb1)->keys,
+ &choke_skb_cb(skb2)->keys,
+ sizeof(choke_skb_cb(skb1)->keys));
+}
+
+/*
+ * Select a packet at random from queue
+ * HACK: since queue can have holes from previous deletion; retry several
+ * times to find a random skb but then just give up and return the head
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+ unsigned int *pidx)
+{
+ struct sk_buff *skb;
+ int retrys = 3;
+
+ do {
+ *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
+ skb = q->tab[*pidx];
+ if (skb)
+ return skb;
+ } while (--retrys > 0);
+
+ return q->tab[*pidx = q->head];
+}
+
+/*
+ * Compare new packet with random packet in queue
+ * returns true if matched and sets *pidx
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+ struct sk_buff *nskb,
+ unsigned int *pidx)
+{
+ struct sk_buff *oskb;
+
+ if (q->head == q->tail)
+ return false;
+
+ oskb = choke_peek_random(q, pidx);
+ return choke_match_flow(oskb, nskb);
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ const struct red_parms *p = &q->parms;
+
+ choke_skb_cb(skb)->keys_valid = 0;
+ /* Compute average queue usage (see RED) */
+ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ /* Is queue small? */
+ if (q->vars.qavg <= p->qth_min)
+ q->vars.qcount = -1;
+ else {
+ unsigned int idx;
+
+ /* Draw a packet at random from queue and compare flow */
+ if (choke_match_random(q, skb, &idx)) {
+ q->stats.matched++;
+ choke_drop_by_idx(sch, idx, to_free);
+ goto congestion_drop;
+ }
+
+ /* Queue is large, always mark/drop */
+ if (q->vars.qavg > p->qth_max) {
+ q->vars.qcount = -1;
+
+ qdisc_qstats_overlimit(sch);
+ if (use_harddrop(q) || !use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ } else if (++q->vars.qcount) {
+ if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
+ q->vars.qcount = 0;
+ q->vars.qR = red_random(p);
+
+ qdisc_qstats_overlimit(sch);
+ if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ }
+ } else
+ q->vars.qR = red_random(p);
+ }
+
+ /* Admit new packet */
+ if (sch->q.qlen < q->limit) {
+ q->tab[q->tail] = skb;
+ q->tail = (q->tail + 1) & q->tab_mask;
+ ++sch->q.qlen;
+ qdisc_qstats_backlog_inc(sch, skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ q->stats.pdrop++;
+ return qdisc_drop(skb, sch, to_free);
+
+congestion_drop:
+ qdisc_drop(skb, sch, to_free);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ if (q->head == q->tail) {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ return NULL;
+ }
+
+ skb = q->tab[q->head];
+ q->tab[q->head] = NULL;
+ choke_zap_head_holes(q);
+ --sch->q.qlen;
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+
+ return skb;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ rtnl_qdisc_drop(skb, sch);
+ }
+
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+ if (q->tab)
+ memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
+ q->head = q->tail = 0;
+ red_restart(&q->vars);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+ [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_CHOKE_MAX_P] = { .type = NLA_U32 },
+};
+
+
+static void choke_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CHOKE_MAX + 1];
+ const struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL;
+ unsigned int mask;
+ u32 max_P;
+ u8 *stab;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CHOKE_PARMS] == NULL ||
+ tb[TCA_CHOKE_STAB] == NULL)
+ return -EINVAL;
+
+ max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
+
+ ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+ stab = nla_data(tb[TCA_CHOKE_STAB]);
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
+ return -EINVAL;
+
+ if (ctl->limit > CHOKE_MAX_QUEUE)
+ return -EINVAL;
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+ if (mask != q->tab_mask) {
+ struct sk_buff **ntab;
+
+ ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
+ if (!ntab)
+ return -ENOMEM;
+
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) {
+ unsigned int oqlen = sch->q.qlen, tail = 0;
+ unsigned dropped = 0;
+
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ if (tail < mask) {
+ ntab[tail++] = skb;
+ continue;
+ }
+ dropped += qdisc_pkt_len(skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ --sch->q.qlen;
+ rtnl_qdisc_drop(skb, sch);
+ }
+ qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
+ q->head = 0;
+ q->tail = tail;
+ }
+
+ q->tab_mask = mask;
+ q->tab = ntab;
+ } else
+ sch_tree_lock(sch);
+
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ stab,
+ max_P);
+ red_set_vars(&q->vars);
+
+ if (q->head == q->tail)
+ red_end_of_idle_period(&q->vars);
+
+ sch_tree_unlock(sch);
+ choke_free(old);
+ return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ return choke_change(sch, opt, extack);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
+ nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_choke_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .matched = q->stats.matched,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ choke_free(q->tab);
+}
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data),
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = choke_peek_head,
+ .init = choke_init,
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 000000000..77fae0b7c
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,309 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *
+ * Implemented on linux by :
+ * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
+ * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/prefetch.h>
+#include <net/pkt_sched.h>
+#include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
+
+
+#define DEFAULT_CODEL_LIMIT 1000
+
+struct codel_sched_data {
+ struct codel_params params;
+ struct codel_vars vars;
+ struct codel_stats stats;
+ u32 drop_overlimit;
+};
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from queue. Note: backlog is handled in
+ * codel, we dont need to reduce it here.
+ */
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
+{
+ struct Qdisc *sch = ctx;
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+
+ if (skb) {
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ prefetch(&skb->end); /* we'll need skb_shinfo() */
+ }
+ return skb;
+}
+
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+ struct Qdisc *sch = ctx;
+
+ kfree_skb(skb);
+ qdisc_qstats_drop(sch);
+}
+
+static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ skb = codel_dequeue(sch, &sch->qstats.backlog, &q->params, &q->vars,
+ &q->stats, qdisc_pkt_len, codel_get_enqueue_time,
+ drop_func, dequeue_func);
+
+ /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
+ * or HTB crashes. Defer it for next round.
+ */
+ if (q->stats.drop_count && sch->q.qlen) {
+ qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
+ q->stats.drop_count = 0;
+ q->stats.drop_len = 0;
+ }
+ if (skb)
+ qdisc_bstats_update(sch, skb);
+ return skb;
+}
+
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct codel_sched_data *q;
+
+ if (likely(qdisc_qlen(sch) < sch->limit)) {
+ codel_set_enqueue_time(skb);
+ return qdisc_enqueue_tail(skb, sch);
+ }
+ q = qdisc_priv(sch);
+ q->drop_overlimit++;
+ return qdisc_drop(skb, sch, to_free);
+}
+
+static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
+ [TCA_CODEL_TARGET] = { .type = NLA_U32 },
+ [TCA_CODEL_LIMIT] = { .type = NLA_U32 },
+ [TCA_CODEL_INTERVAL] = { .type = NLA_U32 },
+ [TCA_CODEL_ECN] = { .type = NLA_U32 },
+ [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 },
+};
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CODEL_MAX + 1];
+ unsigned int qlen, dropped = 0;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy, NULL);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ if (tb[TCA_CODEL_TARGET]) {
+ u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
+
+ q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_CODEL_CE_THRESHOLD]) {
+ u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]);
+
+ q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_CODEL_INTERVAL]) {
+ u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
+
+ q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_CODEL_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+
+ if (tb[TCA_CODEL_ECN])
+ q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);
+
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+
+ dropped += qdisc_pkt_len(skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ rtnl_qdisc_drop(skb, sch);
+ }
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
+ sch->limit = DEFAULT_CODEL_LIMIT;
+
+ codel_params_init(&q->params);
+ codel_vars_init(&q->vars);
+ codel_stats_init(&q->stats);
+ q->params.mtu = psched_mtu(qdisc_dev(sch));
+
+ if (opt) {
+ int err = codel_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CODEL_TARGET,
+ codel_time_to_us(q->params.target)) ||
+ nla_put_u32(skb, TCA_CODEL_LIMIT,
+ sch->limit) ||
+ nla_put_u32(skb, TCA_CODEL_INTERVAL,
+ codel_time_to_us(q->params.interval)) ||
+ nla_put_u32(skb, TCA_CODEL_ECN,
+ q->params.ecn))
+ goto nla_put_failure;
+ if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD &&
+ nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD,
+ codel_time_to_us(q->params.ce_threshold)))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+}
+
+static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ const struct codel_sched_data *q = qdisc_priv(sch);
+ struct tc_codel_xstats st = {
+ .maxpacket = q->stats.maxpacket,
+ .count = q->vars.count,
+ .lastcount = q->vars.lastcount,
+ .drop_overlimit = q->drop_overlimit,
+ .ldelay = codel_time_to_us(q->vars.ldelay),
+ .dropping = q->vars.dropping,
+ .ecn_mark = q->stats.ecn_mark,
+ .ce_mark = q->stats.ce_mark,
+ };
+
+ if (q->vars.dropping) {
+ codel_tdiff_t delta = q->vars.drop_next - codel_get_time();
+
+ if (delta >= 0)
+ st.drop_next = codel_time_to_us(delta);
+ else
+ st.drop_next = -codel_time_to_us(-delta);
+ }
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+ codel_vars_init(&q->vars);
+}
+
+static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+ .id = "codel",
+ .priv_size = sizeof(struct codel_sched_data),
+
+ .enqueue = codel_qdisc_enqueue,
+ .dequeue = codel_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = codel_init,
+ .reset = codel_reset,
+ .change = codel_change,
+ .dump = codel_dump,
+ .dump_stats = codel_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init codel_module_init(void)
+{
+ return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+ unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+
+MODULE_DESCRIPTION("Controlled Delay queue discipline");
+MODULE_AUTHOR("Dave Taht");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
new file mode 100644
index 000000000..cdebaed0f
--- /dev/null
+++ b/net/sched/sch_drr.c
@@ -0,0 +1,514 @@
+/*
+ * net/sched/sch_drr.c Deficit Round Robin scheduler
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+struct drr_class {
+ struct Qdisc_class_common common;
+ unsigned int filter_cnt;
+
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct net_rate_estimator __rcu *rate_est;
+ struct list_head alist;
+ struct Qdisc *qdisc;
+
+ u32 quantum;
+ u32 deficit;
+};
+
+struct drr_sched {
+ struct list_head active;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ struct Qdisc_class_hash clhash;
+};
+
+static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct drr_class, common);
+}
+
+static void drr_purge_queue(struct drr_class *cl)
+{
+ unsigned int len = cl->qdisc->q.qlen;
+ unsigned int backlog = cl->qdisc->qstats.backlog;
+
+ qdisc_reset(cl->qdisc);
+ qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
+}
+
+static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
+ [TCA_DRR_QUANTUM] = { .type = NLA_U32 },
+};
+
+static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl = (struct drr_class *)*arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_DRR_MAX + 1];
+ u32 quantum;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "DRR options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_DRR_QUANTUM]) {
+ quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
+ if (quantum == 0) {
+ NL_SET_ERR_MSG(extack, "Specified DRR quantum cannot be zero");
+ return -EINVAL;
+ }
+ } else
+ quantum = psched_mtu(qdisc_dev(sch));
+
+ if (cl != NULL) {
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to replace estimator");
+ return err;
+ }
+ }
+
+ sch_tree_lock(sch);
+ if (tb[TCA_DRR_QUANTUM])
+ cl->quantum = quantum;
+ sch_tree_unlock(sch);
+
+ return 0;
+ }
+
+ cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL);
+ if (cl == NULL)
+ return -ENOBUFS;
+
+ cl->common.classid = classid;
+ cl->quantum = quantum;
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, classid,
+ NULL);
+ if (cl->qdisc == NULL)
+ cl->qdisc = &noop_qdisc;
+ else
+ qdisc_hash_add(cl->qdisc, true);
+
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to replace estimator");
+ qdisc_put(cl->qdisc);
+ kfree(cl);
+ return err;
+ }
+ }
+
+ sch_tree_lock(sch);
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+}
+
+static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
+{
+ gen_kill_estimator(&cl->rate_est);
+ qdisc_put(cl->qdisc);
+ kfree(cl);
+}
+
+static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ if (cl->filter_cnt > 0)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ drr_purge_queue(cl);
+ qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+ sch_tree_unlock(sch);
+
+ drr_destroy_class(sch, cl);
+ return 0;
+}
+
+static unsigned long drr_search_class(struct Qdisc *sch, u32 classid)
+{
+ return (unsigned long)drr_find_class(sch, classid);
+}
+
+static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+
+ if (cl) {
+ NL_SET_ERR_MSG(extack, "DRR classid must be zero");
+ return NULL;
+ }
+
+ return q->block;
+}
+
+static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct drr_class *cl = drr_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->filter_cnt++;
+
+ return (unsigned long)cl;
+}
+
+static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ cl->filter_cnt--;
+}
+
+static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->common.classid, NULL);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ return 0;
+}
+
+static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ return cl->qdisc;
+}
+
+static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ list_del(&cl->alist);
+}
+
+static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+ __u32 qlen = cl->qdisc->q.qlen;
+ struct tc_drr_stats xstats;
+
+ memset(&xstats, 0, sizeof(xstats));
+ if (qlen)
+ xstats.deficit = cl->deficit;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+ cl = drr_find_class(sch, skb->priority);
+ if (cl != NULL)
+ return cl;
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tcf_classify(skb, fl, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (struct drr_class *)res.class;
+ if (cl == NULL)
+ cl = drr_find_class(sch, res.classid);
+ return cl;
+ }
+ return NULL;
+}
+
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ int err = 0;
+
+ cl = drr_classify(skb, sch, &err);
+ if (cl == NULL) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (cl->qdisc->q.qlen == 1) {
+ list_add_tail(&cl->alist, &q->active);
+ cl->deficit = cl->quantum;
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ return err;
+}
+
+static struct sk_buff *drr_dequeue(struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ struct sk_buff *skb;
+ unsigned int len;
+
+ if (list_empty(&q->active))
+ goto out;
+ while (1) {
+ cl = list_first_entry(&q->active, struct drr_class, alist);
+ skb = cl->qdisc->ops->peek(cl->qdisc);
+ if (skb == NULL) {
+ qdisc_warn_nonwc(__func__, cl->qdisc);
+ goto out;
+ }
+
+ len = qdisc_pkt_len(skb);
+ if (len <= cl->deficit) {
+ cl->deficit -= len;
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (unlikely(skb == NULL))
+ goto out;
+ if (cl->qdisc->q.qlen == 0)
+ list_del(&cl->alist);
+
+ bstats_update(&cl->bstats, skb);
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+
+ cl->deficit += cl->quantum;
+ list_move_tail(&cl->alist, &q->active);
+ }
+out:
+ return NULL;
+}
+
+static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ int err;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+ INIT_LIST_HEAD(&q->active);
+ return 0;
+}
+
+static void drr_reset_qdisc(struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (cl->qdisc->q.qlen)
+ list_del(&cl->alist);
+ qdisc_reset(cl->qdisc);
+ }
+ }
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static void drr_destroy_qdisc(struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ struct hlist_node *next;
+ unsigned int i;
+
+ tcf_block_put(q->block);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode)
+ drr_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+}
+
+static const struct Qdisc_class_ops drr_class_ops = {
+ .change = drr_change_class,
+ .delete = drr_delete_class,
+ .find = drr_search_class,
+ .tcf_block = drr_tcf_block,
+ .bind_tcf = drr_bind_tcf,
+ .unbind_tcf = drr_unbind_tcf,
+ .graft = drr_graft_class,
+ .leaf = drr_class_leaf,
+ .qlen_notify = drr_qlen_notify,
+ .dump = drr_dump_class,
+ .dump_stats = drr_dump_class_stats,
+ .walk = drr_walk,
+};
+
+static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
+ .cl_ops = &drr_class_ops,
+ .id = "drr",
+ .priv_size = sizeof(struct drr_sched),
+ .enqueue = drr_enqueue,
+ .dequeue = drr_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = drr_init_qdisc,
+ .reset = drr_reset_qdisc,
+ .destroy = drr_destroy_qdisc,
+ .owner = THIS_MODULE,
+};
+
+static int __init drr_init(void)
+{
+ return register_qdisc(&drr_qdisc_ops);
+}
+
+static void __exit drr_exit(void)
+{
+ unregister_qdisc(&drr_qdisc_ops);
+}
+
+module_init(drr_init);
+module_exit(drr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
new file mode 100644
index 000000000..47a61689d
--- /dev/null
+++ b/net/sched/sch_dsmark.c
@@ -0,0 +1,519 @@
+/* net/sched/sch_dsmark.c - Differentiated Services field marker */
+
+/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <asm/byteorder.h>
+
+/*
+ * classid class marking
+ * ------- ----- -------
+ * n/a 0 n/a
+ * x:0 1 use entry [0]
+ * ... ... ...
+ * x:y y>0 y+1 use entry [y]
+ * ... ... ...
+ * x:indices-1 indices use entry [indices-1]
+ * ... ... ...
+ * x:y y+1 use entry [y & (indices-1)]
+ * ... ... ...
+ * 0xffff 0x10000 use entry [indices-1]
+ */
+
+
+#define NO_DEFAULT_INDEX (1 << 16)
+
+struct mask_value {
+ u8 mask;
+ u8 value;
+};
+
+struct dsmark_qdisc_data {
+ struct Qdisc *q;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ struct mask_value *mv;
+ u16 indices;
+ u8 set_tc_index;
+ u32 default_index; /* index range is 0...0xffff */
+#define DSMARK_EMBEDDED_SZ 16
+ struct mask_value embedded[DSMARK_EMBEDDED_SZ];
+};
+
+static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
+{
+ return index <= p->indices && index > 0;
+}
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
+ __func__, sch, p, new, old);
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, NULL);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ *old = qdisc_replace(sch, new, &p->q);
+ return 0;
+}
+
+static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ return p->q;
+}
+
+static unsigned long dsmark_find(struct Qdisc *sch, u32 classid)
+{
+ return TC_H_MIN(classid) + 1;
+}
+
+static unsigned long dsmark_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
+ __func__, sch, qdisc_priv(sch), classid);
+
+ return dsmark_find(sch, classid);
+}
+
+static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
+ [TCA_DSMARK_INDICES] = { .type = NLA_U16 },
+ [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 },
+ [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG },
+ [TCA_DSMARK_MASK] = { .type = NLA_U8 },
+ [TCA_DSMARK_VALUE] = { .type = NLA_U8 },
+};
+
+static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_DSMARK_MAX + 1];
+ int err = -EINVAL;
+
+ pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
+ __func__, sch, p, classid, parent, *arg);
+
+ if (!dsmark_valid_index(p, *arg)) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ if (!opt)
+ goto errout;
+
+ err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ if (tb[TCA_DSMARK_VALUE])
+ p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]);
+
+ if (tb[TCA_DSMARK_MASK])
+ p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
+
+ err = 0;
+
+errout:
+ return err;
+}
+
+static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ if (!dsmark_valid_index(p, arg))
+ return -EINVAL;
+
+ p->mv[arg - 1].mask = 0xff;
+ p->mv[arg - 1].value = 0;
+
+ return 0;
+}
+
+static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ int i;
+
+ pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
+ __func__, sch, p, walker);
+
+ if (walker->stop)
+ return;
+
+ for (i = 0; i < p->indices; i++) {
+ if (p->mv[i].mask == 0xff && !p->mv[i].value)
+ goto ignore;
+ if (walker->count >= walker->skip) {
+ if (walker->fn(sch, i + 1, walker) < 0) {
+ walker->stop = 1;
+ break;
+ }
+ }
+ignore:
+ walker->count++;
+ }
+}
+
+static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ return p->block;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ int err;
+
+ pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
+
+ if (p->set_tc_index) {
+ int wlen = skb_network_offset(skb);
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ goto drop;
+
+ skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
+ & ~INET_ECN_MASK;
+ break;
+
+ case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ goto drop;
+
+ skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
+ & ~INET_ECN_MASK;
+ break;
+ default:
+ skb->tc_index = 0;
+ break;
+ }
+ }
+
+ if (TC_H_MAJ(skb->priority) == sch->handle)
+ skb->tc_index = TC_H_MIN(skb->priority);
+ else {
+ struct tcf_result res;
+ struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
+ int result = tcf_classify(skb, fl, &res, false);
+
+ pr_debug("result %d class 0x%04x\n", result, res.classid);
+
+ switch (result) {
+#ifdef CONFIG_NET_CLS_ACT
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ __qdisc_drop(skb, to_free);
+ return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+
+ case TC_ACT_SHOT:
+ goto drop;
+#endif
+ case TC_ACT_OK:
+ skb->tc_index = TC_H_MIN(res.classid);
+ break;
+
+ default:
+ if (p->default_index != NO_DEFAULT_INDEX)
+ skb->tc_index = p->default_index;
+ break;
+ }
+ }
+
+ err = qdisc_enqueue(skb, p->q, to_free);
+ if (err != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(err))
+ qdisc_qstats_drop(sch);
+ return err;
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+
+drop:
+ qdisc_drop(skb, sch, to_free);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct sk_buff *skb;
+ u32 index;
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+
+ skb = qdisc_dequeue_peeked(p->q);
+ if (skb == NULL)
+ return NULL;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ index = skb->tc_index & (p->indices - 1);
+ pr_debug("index %d->%d\n", skb->tc_index, index);
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask,
+ p->mv[index].value);
+ break;
+ case htons(ETH_P_IPV6):
+ ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask,
+ p->mv[index].value);
+ break;
+ default:
+ /*
+ * Only complain if a change was actually attempted.
+ * This way, we can send non-IP traffic through dsmark
+ * and don't need yet another qdisc as a bypass.
+ */
+ if (p->mv[index].mask != 0xff || p->mv[index].value)
+ pr_warn("%s: unsupported protocol %d\n",
+ __func__, ntohs(skb_protocol(skb, true)));
+ break;
+ }
+
+ return skb;
+}
+
+static struct sk_buff *dsmark_peek(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+
+ return p->q->ops->peek(p->q);
+}
+
+static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *tb[TCA_DSMARK_MAX + 1];
+ int err = -EINVAL;
+ u32 default_index = NO_DEFAULT_INDEX;
+ u16 indices;
+ int i;
+
+ pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
+
+ if (!opt)
+ goto errout;
+
+ err = tcf_block_get(&p->block, &p->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ if (!tb[TCA_DSMARK_INDICES])
+ goto errout;
+ indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
+
+ if (hweight32(indices) != 1)
+ goto errout;
+
+ if (tb[TCA_DSMARK_DEFAULT_INDEX])
+ default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
+
+ if (indices <= DSMARK_EMBEDDED_SZ)
+ p->mv = p->embedded;
+ else
+ p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL);
+ if (!p->mv) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ for (i = 0; i < indices; i++) {
+ p->mv[i].mask = 0xff;
+ p->mv[i].value = 0;
+ }
+ p->indices = indices;
+ p->default_index = default_index;
+ p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
+
+ p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle,
+ NULL);
+ if (p->q == NULL)
+ p->q = &noop_qdisc;
+ else
+ qdisc_hash_add(p->q, true);
+
+ pr_debug("%s: qdisc %p\n", __func__, p->q);
+
+ err = 0;
+errout:
+ return err;
+}
+
+static void dsmark_reset(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+ if (p->q)
+ qdisc_reset(p->q);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static void dsmark_destroy(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+
+ tcf_block_put(p->block);
+ qdisc_put(p->q);
+ if (p->mv != p->embedded)
+ kfree(p->mv);
+}
+
+static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+
+ pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
+
+ if (!dsmark_valid_index(p, cl))
+ return -EINVAL;
+
+ tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
+ tcm->tcm_info = p->q->handle;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) ||
+ nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
+ goto nla_put_failure;
+
+ if (p->default_index != NO_DEFAULT_INDEX &&
+ nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
+ goto nla_put_failure;
+
+ if (p->set_tc_index &&
+ nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static const struct Qdisc_class_ops dsmark_class_ops = {
+ .graft = dsmark_graft,
+ .leaf = dsmark_leaf,
+ .find = dsmark_find,
+ .change = dsmark_change,
+ .delete = dsmark_delete,
+ .walk = dsmark_walk,
+ .tcf_block = dsmark_tcf_block,
+ .bind_tcf = dsmark_bind_filter,
+ .unbind_tcf = dsmark_unbind_filter,
+ .dump = dsmark_dump_class,
+};
+
+static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &dsmark_class_ops,
+ .id = "dsmark",
+ .priv_size = sizeof(struct dsmark_qdisc_data),
+ .enqueue = dsmark_enqueue,
+ .dequeue = dsmark_dequeue,
+ .peek = dsmark_peek,
+ .init = dsmark_init,
+ .reset = dsmark_reset,
+ .destroy = dsmark_destroy,
+ .change = NULL,
+ .dump = dsmark_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init dsmark_module_init(void)
+{
+ return register_qdisc(&dsmark_qdisc_ops);
+}
+
+static void __exit dsmark_module_exit(void)
+{
+ unregister_qdisc(&dsmark_qdisc_ops);
+}
+
+module_init(dsmark_module_init)
+module_exit(dsmark_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000..2278f3d42
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_etf.c Earliest TxTime First queueing discipline.
+ *
+ * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ * Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+
+struct etf_sched_data {
+ bool offload;
+ bool deadline_mode;
+ int clockid;
+ int queue;
+ s32 delta; /* in ns */
+ ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+ struct rb_root head;
+ struct qdisc_watchdog watchdog;
+ ktime_t (*get_time)(void);
+};
+
+static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
+ [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) },
+};
+
+static inline int validate_input_params(struct tc_etf_qopt *qopt,
+ struct netlink_ext_ack *extack)
+{
+ /* Check if params comply to the following rules:
+ * * Clockid and delta must be valid.
+ *
+ * * Dynamic clockids are not supported.
+ *
+ * * Delta must be a positive integer.
+ *
+ * Also note that for the HW offload case, we must
+ * expect that system clocks have been synchronized to PHC.
+ */
+ if (qopt->clockid < 0) {
+ NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
+ return -ENOTSUPP;
+ }
+
+ if (qopt->clockid != CLOCK_TAI) {
+ NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
+ return -EINVAL;
+ }
+
+ if (qopt->delta < 0) {
+ NL_SET_ERR_MSG(extack, "Delta must be positive");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ ktime_t txtime = nskb->tstamp;
+ struct sock *sk = nskb->sk;
+ ktime_t now;
+
+ if (!sk || !sk_fullsock(sk))
+ return false;
+
+ if (!sock_flag(sk, SOCK_TXTIME))
+ return false;
+
+ /* We don't perform crosstimestamping.
+ * Drop if packet's clockid differs from qdisc's.
+ */
+ if (sk->sk_clockid != q->clockid)
+ return false;
+
+ if (sk->sk_txtime_deadline_mode != q->deadline_mode)
+ return false;
+
+ now = q->get_time();
+ if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+ return false;
+
+ return true;
+}
+
+static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p;
+
+ p = rb_first(&q->head);
+ if (!p)
+ return NULL;
+
+ return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = etf_peek_timesortedlist(sch);
+ ktime_t next;
+
+ if (!skb)
+ return;
+
+ next = ktime_sub_ns(skb->tstamp, q->delta);
+ qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *clone;
+ ktime_t txtime = skb->tstamp;
+ struct sock *sk = skb->sk;
+
+ if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors))
+ return;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone)
+ return;
+
+ serr = SKB_EXT_ERR(clone);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
+ serr->ee.ee_info = txtime; /* low part of tstamp */
+
+ if (sock_queue_err_skb(sk, clone))
+ kfree_skb(clone);
+}
+
+static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node **p = &q->head.rb_node, *parent = NULL;
+ ktime_t txtime = nskb->tstamp;
+
+ if (!is_packet_valid(sch, nskb)) {
+ report_sock_error(nskb, EINVAL,
+ SO_EE_CODE_TXTIME_INVALID_PARAM);
+ return qdisc_drop(nskb, sch, to_free);
+ }
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (ktime_after(txtime, skb->tstamp))
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->head);
+
+ qdisc_qstats_backlog_inc(sch, nskb);
+ sch->q.qlen++;
+
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return NET_XMIT_SUCCESS;
+}
+
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+ bool drop)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ rb_erase(&skb->rbnode, &q->head);
+
+ /* The rbnode field in the skb re-uses these fields, now that
+ * we are done with the rbnode, reset them.
+ */
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->dev = qdisc_dev(sch);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+
+ if (drop) {
+ struct sk_buff *to_free = NULL;
+
+ report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+
+ qdisc_drop(skb, sch, &to_free);
+ kfree_skb_list(to_free);
+ qdisc_qstats_overlimit(sch);
+ } else {
+ qdisc_bstats_update(sch, skb);
+
+ q->last = skb->tstamp;
+ }
+
+ sch->q.qlen--;
+}
+
+static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ ktime_t now, next;
+
+ skb = etf_peek_timesortedlist(sch);
+ if (!skb)
+ return NULL;
+
+ now = q->get_time();
+
+ /* Drop if packet has expired while in queue. */
+ if (ktime_before(skb->tstamp, now)) {
+ timesortedlist_erase(sch, skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ /* When in deadline mode, dequeue as soon as possible and change the
+ * txtime from deadline to (now + delta).
+ */
+ if (q->deadline_mode) {
+ timesortedlist_erase(sch, skb, false);
+ skb->tstamp = now;
+ goto out;
+ }
+
+ next = ktime_sub_ns(skb->tstamp, q->delta);
+
+ /* Dequeue only if now is within the [txtime - delta, txtime] range. */
+ if (ktime_after(now, next))
+ timesortedlist_erase(sch, skb, false);
+ else
+ skb = NULL;
+
+out:
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return skb;
+}
+
+static void etf_disable_offload(struct net_device *dev,
+ struct etf_sched_data *q)
+{
+ struct tc_etf_qopt_offload etf = { };
+ const struct net_device_ops *ops;
+ int err;
+
+ if (!q->offload)
+ return;
+
+ ops = dev->netdev_ops;
+ if (!ops->ndo_setup_tc)
+ return;
+
+ etf.queue = q->queue;
+ etf.enable = 0;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
+ if (err < 0)
+ pr_warn("Couldn't disable ETF offload for queue %d\n",
+ etf.queue);
+}
+
+static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_etf_qopt_offload etf = { };
+ int err;
+
+ if (q->offload)
+ return 0;
+
+ if (!ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload");
+ return -EOPNOTSUPP;
+ }
+
+ etf.queue = q->queue;
+ etf.enable = 1;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload");
+ return err;
+ }
+
+ return 0;
+}
+
+static int etf_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct nlattr *tb[TCA_ETF_MAX + 1];
+ struct tc_etf_qopt *qopt;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack,
+ "Missing ETF qdisc options which are mandatory");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETF_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
+ return -EINVAL;
+ }
+
+ qopt = nla_data(tb[TCA_ETF_PARMS]);
+
+ pr_debug("delta %d clockid %d offload %s deadline %s\n",
+ qopt->delta, qopt->clockid,
+ OFFLOAD_IS_ON(qopt) ? "on" : "off",
+ DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
+
+ err = validate_input_params(qopt, extack);
+ if (err < 0)
+ return err;
+
+ q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+ if (OFFLOAD_IS_ON(qopt)) {
+ err = etf_enable_offload(dev, q, extack);
+ if (err < 0)
+ return err;
+ }
+
+ /* Everything went OK, save the parameters used. */
+ q->delta = qopt->delta;
+ q->clockid = qopt->clockid;
+ q->offload = OFFLOAD_IS_ON(qopt);
+ q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+
+ switch (q->clockid) {
+ case CLOCK_REALTIME:
+ q->get_time = ktime_get_real;
+ break;
+ case CLOCK_MONOTONIC:
+ q->get_time = ktime_get;
+ break;
+ case CLOCK_BOOTTIME:
+ q->get_time = ktime_get_boottime;
+ break;
+ case CLOCK_TAI:
+ q->get_time = ktime_get_clocktai;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Clockid is not supported");
+ return -ENOTSUPP;
+ }
+
+ qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+ return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p = rb_first(&q->head);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+
+ rb_erase(&skb->rbnode, &q->head);
+ rtnl_kfree_skbs(skb, skb);
+ sch->q.qlen--;
+ }
+}
+
+static void etf_reset(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ /* Only cancel watchdog if it's been initialized. */
+ if (q->watchdog.qdisc == sch)
+ qdisc_watchdog_cancel(&q->watchdog);
+
+ /* No matter which mode we are on, it's safe to clear both lists. */
+ timesortedlist_clear(sch);
+ __qdisc_reset_queue(&sch->q);
+
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+
+ q->last = 0;
+}
+
+static void etf_destroy(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ /* Only cancel watchdog if it's been initialized. */
+ if (q->watchdog.qdisc == sch)
+ qdisc_watchdog_cancel(&q->watchdog);
+
+ etf_disable_offload(dev, q);
+}
+
+static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct tc_etf_qopt opt = { };
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ opt.delta = q->delta;
+ opt.clockid = q->clockid;
+ if (q->offload)
+ opt.flags |= TC_ETF_OFFLOAD_ON;
+
+ if (q->deadline_mode)
+ opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+
+ if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
+ .id = "etf",
+ .priv_size = sizeof(struct etf_sched_data),
+ .enqueue = etf_enqueue_timesortedlist,
+ .dequeue = etf_dequeue_timesortedlist,
+ .peek = etf_peek_timesortedlist,
+ .init = etf_init,
+ .reset = etf_reset,
+ .destroy = etf_destroy,
+ .dump = etf_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init etf_module_init(void)
+{
+ return register_qdisc(&etf_qdisc_ops);
+}
+
+static void __exit etf_module_exit(void)
+{
+ unregister_qdisc(&etf_qdisc_ops);
+}
+module_init(etf_module_init)
+module_exit(etf_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
new file mode 100644
index 000000000..3697cd799
--- /dev/null
+++ b/net/sched/sch_fifo.c
@@ -0,0 +1,190 @@
+/*
+ * net/sched/sch_fifo.c The simplest FIFO queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/* 1 band FIFO pseudo-"scheduler" */
+
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
+ return qdisc_enqueue_tail(skb, sch);
+
+ return qdisc_drop(skb, sch, to_free);
+}
+
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ if (likely(sch->q.qlen < sch->limit))
+ return qdisc_enqueue_tail(skb, sch);
+
+ return qdisc_drop(skb, sch, to_free);
+}
+
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ unsigned int prev_backlog;
+
+ if (likely(sch->q.qlen < sch->limit))
+ return qdisc_enqueue_tail(skb, sch);
+
+ prev_backlog = sch->qstats.backlog;
+ /* queue full, remove one skb to fulfill the limit */
+ __qdisc_queue_drop_head(sch, &sch->q, to_free);
+ qdisc_qstats_drop(sch);
+ qdisc_enqueue_tail(skb, sch);
+
+ qdisc_tree_reduce_backlog(sch, 0, prev_backlog - sch->qstats.backlog);
+ return NET_XMIT_CN;
+}
+
+static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ bool bypass;
+ bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
+
+ if (opt == NULL) {
+ u32 limit = qdisc_dev(sch)->tx_queue_len;
+
+ if (is_bfifo)
+ limit *= psched_mtu(qdisc_dev(sch));
+
+ sch->limit = limit;
+ } else {
+ struct tc_fifo_qopt *ctl = nla_data(opt);
+
+ if (nla_len(opt) < sizeof(*ctl))
+ return -EINVAL;
+
+ sch->limit = ctl->limit;
+ }
+
+ if (is_bfifo)
+ bypass = sch->limit >= psched_mtu(qdisc_dev(sch));
+ else
+ bypass = sch->limit >= 1;
+
+ if (bypass)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tc_fifo_qopt opt = { .limit = sch->limit };
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
+ .id = "pfifo",
+ .priv_size = 0,
+ .enqueue = pfifo_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .peek = qdisc_peek_head,
+ .init = fifo_init,
+ .reset = qdisc_reset_queue,
+ .change = fifo_init,
+ .dump = fifo_dump,
+ .owner = THIS_MODULE,
+};
+EXPORT_SYMBOL(pfifo_qdisc_ops);
+
+struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
+ .id = "bfifo",
+ .priv_size = 0,
+ .enqueue = bfifo_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .peek = qdisc_peek_head,
+ .init = fifo_init,
+ .reset = qdisc_reset_queue,
+ .change = fifo_init,
+ .dump = fifo_dump,
+ .owner = THIS_MODULE,
+};
+EXPORT_SYMBOL(bfifo_qdisc_ops);
+
+struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
+ .id = "pfifo_head_drop",
+ .priv_size = 0,
+ .enqueue = pfifo_tail_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .peek = qdisc_peek_head,
+ .init = fifo_init,
+ .reset = qdisc_reset_queue,
+ .change = fifo_init,
+ .dump = fifo_dump,
+ .owner = THIS_MODULE,
+};
+
+/* Pass size change message down to embedded FIFO */
+int fifo_set_limit(struct Qdisc *q, unsigned int limit)
+{
+ struct nlattr *nla;
+ int ret = -ENOMEM;
+
+ /* Hack to avoid sending change message to non-FIFO */
+ if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+ return 0;
+
+ if (!q->ops->change)
+ return 0;
+
+ nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
+ if (nla) {
+ nla->nla_type = RTM_NEWQDISC;
+ nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
+ ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;
+
+ ret = q->ops->change(q, nla, NULL);
+ kfree(nla);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(fifo_set_limit);
+
+struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
+ unsigned int limit,
+ struct netlink_ext_ack *extack)
+{
+ struct Qdisc *q;
+ int err = -ENOMEM;
+
+ q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1),
+ extack);
+ if (q) {
+ err = fifo_set_limit(q, limit);
+ if (err < 0) {
+ qdisc_put(q);
+ q = NULL;
+ }
+ }
+
+ return q ? : ERR_PTR(err);
+}
+EXPORT_SYMBOL(fifo_create_dflt);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 000000000..ba60a8dd5
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,936 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Meant to be mostly used for locally generated traffic :
+ * Fast classification depends on skb->sk being set before reaching us.
+ * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ * All packets belonging to a socket are considered as a 'flow'.
+ *
+ * Flows are dynamically allocated and stored in a hash table of RB trees
+ * They are also part of one Round Robin 'queues' (new or old flows)
+ *
+ * Burst avoidance (aka pacing) capability :
+ *
+ * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ * bunch of packets, and this packet scheduler adds delay between
+ * packets to respect rate limitation.
+ *
+ * enqueue() :
+ * - lookup one RB tree (out of 1024 or more) to find the flow.
+ * If non existent flow, create it, add it to the tree.
+ * Add skb to the per flow list of skb (fifo).
+ * - Use a special fifo for high prio packets
+ *
+ * dequeue() : serves flows in Round Robin
+ * Note : When a flow becomes empty, we do not immediately remove it from
+ * rb trees, for performance reasons (its expected to send additional packets,
+ * or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ */
+struct fq_flow {
+ struct sk_buff *head; /* list of skbs for this flow : first skb */
+ union {
+ struct sk_buff *tail; /* last skb in the list */
+ unsigned long age; /* jiffies when flow was emptied, for gc */
+ };
+ struct rb_node fq_node; /* anchor in fq_root[] trees */
+ struct sock *sk;
+ int qlen; /* number of packets in flow queue */
+ int credit;
+ u32 socket_hash; /* sk_hash */
+ struct fq_flow *next; /* next pointer in RR lists, or &detached */
+
+ struct rb_node rate_node; /* anchor in q->delayed tree */
+ u64 time_next_packet;
+};
+
+struct fq_flow_head {
+ struct fq_flow *first;
+ struct fq_flow *last;
+};
+
+struct fq_sched_data {
+ struct fq_flow_head new_flows;
+
+ struct fq_flow_head old_flows;
+
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
+
+ struct fq_flow internal; /* for non classified or high prio packets */
+ u32 quantum;
+ u32 initial_quantum;
+ u32 flow_refill_delay;
+ u32 flow_max_rate; /* optional max rate per flow */
+ u32 flow_plimit; /* max packets per flow */
+ u32 orphan_mask; /* mask for orphaned skb */
+ u32 low_rate_threshold;
+ struct rb_root *fq_root;
+ u8 rate_enable;
+ u8 fq_trees_log;
+
+ u32 flows;
+ u32 inactive_flows;
+ u32 throttled_flows;
+
+ u64 stat_gc_flows;
+ u64 stat_internal_packets;
+ u64 stat_tcp_retrans;
+ u64 stat_throttled;
+ u64 stat_flows_plimit;
+ u64 stat_pkts_too_long;
+ u64 stat_allocation_errors;
+ struct qdisc_watchdog watchdog;
+};
+
+/* special value to mark a detached flow (not on old/new list) */
+static struct fq_flow detached, throttled;
+
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+ f->next = &detached;
+ f->age = jiffies;
+}
+
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+ return f->next == &detached;
+}
+
+static bool fq_flow_is_throttled(const struct fq_flow *f)
+{
+ return f->next == &throttled;
+}
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+ if (head->first)
+ head->last->next = flow;
+ else
+ head->first = flow;
+ head->last = flow;
+ flow->next = NULL;
+}
+
+static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ rb_erase(&f->rate_node, &q->delayed);
+ q->throttled_flows--;
+ fq_flow_add_tail(&q->old_flows, f);
+}
+
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct fq_flow *aux;
+
+ parent = *p;
+ aux = rb_entry(parent, struct fq_flow, rate_node);
+ if (f->time_next_packet >= aux->time_next_packet)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&f->rate_node, parent, p);
+ rb_insert_color(&f->rate_node, &q->delayed);
+ q->throttled_flows++;
+ q->stat_throttled++;
+
+ f->next = &throttled;
+ if (q->time_next_delayed_flow > f->time_next_packet)
+ q->time_next_delayed_flow = f->time_next_packet;
+}
+
+
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ)
+
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+ return fq_flow_is_detached(f) &&
+ time_after(jiffies, f->age + FQ_GC_AGE);
+}
+
+static void fq_gc(struct fq_sched_data *q,
+ struct rb_root *root,
+ struct sock *sk)
+{
+ struct fq_flow *f, *tofree[FQ_GC_MAX];
+ struct rb_node **p, *parent;
+ int fcnt = 0;
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = rb_entry(parent, struct fq_flow, fq_node);
+ if (f->sk == sk)
+ break;
+
+ if (fq_gc_candidate(f)) {
+ tofree[fcnt++] = f;
+ if (fcnt == FQ_GC_MAX)
+ break;
+ }
+
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+ while (fcnt) {
+ struct fq_flow *f = tofree[--fcnt];
+
+ rb_erase(&f->fq_node, root);
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+}
+
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+ struct rb_node **p, *parent;
+ struct sock *sk = skb->sk;
+ struct rb_root *root;
+ struct fq_flow *f;
+
+ /* warning: no starvation prevention... */
+ if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+ return &q->internal;
+
+ /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
+ * or a listener (SYNCOOKIE mode)
+ * 1) request sockets are not full blown,
+ * they do not contain sk_pacing_rate
+ * 2) They are not part of a 'flow' yet
+ * 3) We do not want to rate limit them (eg SYNFLOOD attack),
+ * especially if the listener set SO_MAX_PACING_RATE
+ * 4) We pretend they are orphaned
+ */
+ if (!sk || sk_listener(sk)) {
+ unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+
+ /* By forcing low order bit to 1, we make sure to not
+ * collide with a local flow (socket pointers are word aligned)
+ */
+ sk = (struct sock *)((hash << 1) | 1UL);
+ skb_orphan(skb);
+ }
+
+ root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
+
+ if (q->flows >= (2U << q->fq_trees_log) &&
+ q->inactive_flows > q->flows/2)
+ fq_gc(q, root, sk);
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = rb_entry(parent, struct fq_flow, fq_node);
+ if (f->sk == sk) {
+ /* socket might have been reallocated, so check
+ * if its sk_hash is the same.
+ * It not, we need to refill credit with
+ * initial quantum
+ */
+ if (unlikely(skb->sk &&
+ f->socket_hash != sk->sk_hash)) {
+ f->credit = q->initial_quantum;
+ f->socket_hash = sk->sk_hash;
+ if (fq_flow_is_throttled(f))
+ fq_flow_unset_throttled(q, f);
+ f->time_next_packet = 0ULL;
+ }
+ return f;
+ }
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!f)) {
+ q->stat_allocation_errors++;
+ return &q->internal;
+ }
+ fq_flow_set_detached(f);
+ f->sk = sk;
+ if (skb->sk)
+ f->socket_hash = sk->sk_hash;
+ f->credit = q->initial_quantum;
+
+ rb_link_node(&f->fq_node, parent, p);
+ rb_insert_color(&f->fq_node, root);
+
+ q->flows++;
+ q->inactive_flows++;
+ return f;
+}
+
+
+/* remove one skb from head of flow queue */
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ if (skb) {
+ flow->head = skb->next;
+ skb->next = NULL;
+ flow->qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ }
+ return skb;
+}
+
+/* We might add in the future detection of retransmits
+ * For the time being, just return false
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+ return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head-> [retrans pkt 1]
+ * [retrans pkt 2]
+ * [ normal pkt 1]
+ * [ normal pkt 2]
+ * [ normal pkt 3]
+ * tail-> [ normal pkt 4]
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *head = flow->head;
+
+ skb->next = NULL;
+ if (!head) {
+ flow->head = skb;
+ flow->tail = skb;
+ return;
+ }
+ if (likely(!skb_is_retransmit(skb))) {
+ flow->tail->next = skb;
+ flow->tail = skb;
+ return;
+ }
+
+ /* This skb is a tcp retransmit,
+ * find the last retrans packet in the queue
+ */
+ prev = NULL;
+ while (skb_is_retransmit(head)) {
+ prev = head;
+ head = head->next;
+ if (!head)
+ break;
+ }
+ if (!prev) { /* no rtx packet in queue, become the new head */
+ skb->next = flow->head;
+ flow->head = skb;
+ } else {
+ if (prev == flow->tail)
+ flow->tail = skb;
+ else
+ skb->next = prev->next;
+ prev->next = skb;
+ }
+}
+
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_flow *f;
+
+ if (unlikely(sch->q.qlen >= sch->limit))
+ return qdisc_drop(skb, sch, to_free);
+
+ f = fq_classify(skb, q);
+ if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+ q->stat_flows_plimit++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ f->qlen++;
+ if (skb_is_retransmit(skb))
+ q->stat_tcp_retrans++;
+ qdisc_qstats_backlog_inc(sch, skb);
+ if (fq_flow_is_detached(f)) {
+ struct sock *sk = skb->sk;
+
+ fq_flow_add_tail(&q->new_flows, f);
+ if (time_after(jiffies, f->age + q->flow_refill_delay))
+ f->credit = max_t(u32, f->credit, q->quantum);
+ if (sk && q->rate_enable) {
+ if (unlikely(smp_load_acquire(&sk->sk_pacing_status) !=
+ SK_PACING_FQ))
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
+ }
+ q->inactive_flows--;
+ }
+
+ /* Note: this overwrites f->age */
+ flow_queue_add(f, skb);
+
+ if (unlikely(f == &q->internal)) {
+ q->stat_internal_packets++;
+ }
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+ unsigned long sample;
+ struct rb_node *p;
+
+ if (q->time_next_delayed_flow > now)
+ return;
+
+ /* Update unthrottle latency EWMA.
+ * This is cheap and can help diagnosing timer/latency problems.
+ */
+ sample = (unsigned long)(now - q->time_next_delayed_flow);
+ q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+ q->unthrottle_latency_ns += sample >> 3;
+
+ q->time_next_delayed_flow = ~0ULL;
+ while ((p = rb_first(&q->delayed)) != NULL) {
+ struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
+
+ if (f->time_next_packet > now) {
+ q->time_next_delayed_flow = f->time_next_packet;
+ break;
+ }
+ fq_flow_unset_throttled(q, f);
+ }
+}
+
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_get_ns();
+ struct fq_flow_head *head;
+ struct sk_buff *skb;
+ struct fq_flow *f;
+ u32 rate, plen;
+
+ skb = fq_dequeue_head(sch, &q->internal);
+ if (skb)
+ goto out;
+ fq_check_throttled(q, now);
+begin:
+ head = &q->new_flows;
+ if (!head->first) {
+ head = &q->old_flows;
+ if (!head->first) {
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ q->time_next_delayed_flow);
+ return NULL;
+ }
+ }
+ f = head->first;
+
+ if (f->credit <= 0) {
+ f->credit += q->quantum;
+ head->first = f->next;
+ fq_flow_add_tail(&q->old_flows, f);
+ goto begin;
+ }
+
+ skb = f->head;
+ if (unlikely(skb && now < f->time_next_packet &&
+ !skb_is_tcp_pure_ack(skb))) {
+ head->first = f->next;
+ fq_flow_set_throttled(q, f);
+ goto begin;
+ }
+
+ skb = fq_dequeue_head(sch, f);
+ if (!skb) {
+ head->first = f->next;
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && q->old_flows.first) {
+ fq_flow_add_tail(&q->old_flows, f);
+ } else {
+ fq_flow_set_detached(f);
+ q->inactive_flows++;
+ }
+ goto begin;
+ }
+ prefetch(&skb->end);
+ f->credit -= qdisc_pkt_len(skb);
+
+ if (!q->rate_enable)
+ goto out;
+
+ /* Do not pace locally generated ack packets */
+ if (skb_is_tcp_pure_ack(skb))
+ goto out;
+
+ rate = q->flow_max_rate;
+ if (skb->sk)
+ rate = min(skb->sk->sk_pacing_rate, rate);
+
+ if (rate <= q->low_rate_threshold) {
+ f->credit = 0;
+ plen = qdisc_pkt_len(skb);
+ } else {
+ plen = max(qdisc_pkt_len(skb), q->quantum);
+ if (f->credit > 0)
+ goto out;
+ }
+ if (rate != ~0U) {
+ u64 len = (u64)plen * NSEC_PER_SEC;
+
+ if (likely(rate))
+ do_div(len, rate);
+ /* Since socket rate can change later,
+ * clamp the delay to 1 second.
+ * Really, providers of too big packets should be fixed !
+ */
+ if (unlikely(len > NSEC_PER_SEC)) {
+ len = NSEC_PER_SEC;
+ q->stat_pkts_too_long++;
+ }
+ /* Account for schedule/timers drifts.
+ * f->time_next_packet was set when prior packet was sent,
+ * and current time (@now) can be too late by tens of us.
+ */
+ if (f->time_next_packet)
+ len -= min(len/2, now - f->time_next_packet);
+ f->time_next_packet = now + len;
+ }
+out:
+ qdisc_bstats_update(sch, skb);
+ return skb;
+}
+
+static void fq_flow_purge(struct fq_flow *flow)
+{
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+ flow->qlen = 0;
+}
+
+static void fq_reset(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *root;
+ struct rb_node *p;
+ struct fq_flow *f;
+ unsigned int idx;
+
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+
+ fq_flow_purge(&q->internal);
+
+ if (!q->fq_root)
+ return;
+
+ for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+ root = &q->fq_root[idx];
+ while ((p = rb_first(root)) != NULL) {
+ f = rb_entry(p, struct fq_flow, fq_node);
+ rb_erase(p, root);
+
+ fq_flow_purge(f);
+
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+ }
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->flows = 0;
+ q->inactive_flows = 0;
+ q->throttled_flows = 0;
+}
+
+static void fq_rehash(struct fq_sched_data *q,
+ struct rb_root *old_array, u32 old_log,
+ struct rb_root *new_array, u32 new_log)
+{
+ struct rb_node *op, **np, *parent;
+ struct rb_root *oroot, *nroot;
+ struct fq_flow *of, *nf;
+ int fcnt = 0;
+ u32 idx;
+
+ for (idx = 0; idx < (1U << old_log); idx++) {
+ oroot = &old_array[idx];
+ while ((op = rb_first(oroot)) != NULL) {
+ rb_erase(op, oroot);
+ of = rb_entry(op, struct fq_flow, fq_node);
+ if (fq_gc_candidate(of)) {
+ fcnt++;
+ kmem_cache_free(fq_flow_cachep, of);
+ continue;
+ }
+ nroot = &new_array[hash_ptr(of->sk, new_log)];
+
+ np = &nroot->rb_node;
+ parent = NULL;
+ while (*np) {
+ parent = *np;
+
+ nf = rb_entry(parent, struct fq_flow, fq_node);
+ BUG_ON(nf->sk == of->sk);
+
+ if (nf->sk > of->sk)
+ np = &parent->rb_right;
+ else
+ np = &parent->rb_left;
+ }
+
+ rb_link_node(&of->fq_node, parent, np);
+ rb_insert_color(&of->fq_node, nroot);
+ }
+ }
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+}
+
+static void fq_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int fq_resize(struct Qdisc *sch, u32 log)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *array;
+ void *old_fq_root;
+ u32 idx;
+
+ if (q->fq_root && log == q->fq_trees_log)
+ return 0;
+
+ /* If XPS was setup, we can allocate memory on right NUMA node */
+ array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
+ netdev_queue_numa_node_read(sch->dev_queue));
+ if (!array)
+ return -ENOMEM;
+
+ for (idx = 0; idx < (1U << log); idx++)
+ array[idx] = RB_ROOT;
+
+ sch_tree_lock(sch);
+
+ old_fq_root = q->fq_root;
+ if (old_fq_root)
+ fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
+
+ q->fq_root = array;
+ q->fq_trees_log = log;
+
+ sch_tree_unlock(sch);
+
+ fq_free(old_fq_root);
+
+ return 0;
+}
+
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+ [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
+ [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
+};
+
+static int fq_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_MAX + 1];
+ int err, drop_count = 0;
+ unsigned drop_len = 0;
+ u32 fq_log;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ fq_log = q->fq_trees_log;
+
+ if (tb[TCA_FQ_BUCKETS_LOG]) {
+ u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+ if (nval >= 1 && nval <= ilog2(256*1024))
+ fq_log = nval;
+ else
+ err = -EINVAL;
+ }
+ if (tb[TCA_FQ_PLIMIT])
+ sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+ if (tb[TCA_FQ_FLOW_PLIMIT])
+ q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+ if (tb[TCA_FQ_QUANTUM]) {
+ u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+ if (quantum > 0 && quantum <= (1 << 20)) {
+ q->quantum = quantum;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
+ err = -EINVAL;
+ }
+ }
+
+ if (tb[TCA_FQ_INITIAL_QUANTUM])
+ q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+ if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+ pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+ nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
+
+ if (tb[TCA_FQ_FLOW_MAX_RATE])
+ q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+ if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
+ q->low_rate_threshold =
+ nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+
+ if (tb[TCA_FQ_RATE_ENABLE]) {
+ u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+ if (enable <= 1)
+ q->rate_enable = enable;
+ else
+ err = -EINVAL;
+ }
+
+ if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+ u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
+
+ q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+ }
+
+ if (tb[TCA_FQ_ORPHAN_MASK])
+ q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+
+ if (!err) {
+ sch_tree_unlock(sch);
+ err = fq_resize(sch, fq_log);
+ sch_tree_lock(sch);
+ }
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_dequeue(sch);
+
+ if (!skb)
+ break;
+ drop_len += qdisc_pkt_len(skb);
+ rtnl_kfree_skbs(skb, skb);
+ drop_count++;
+ }
+ qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
+
+ sch_tree_unlock(sch);
+ return err;
+}
+
+static void fq_destroy(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+
+ fq_reset(sch);
+ fq_free(q->fq_root);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int fq_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ sch->limit = 10000;
+ q->flow_plimit = 100;
+ q->quantum = 2 * psched_mtu(qdisc_dev(sch));
+ q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
+ q->flow_refill_delay = msecs_to_jiffies(40);
+ q->flow_max_rate = ~0U;
+ q->time_next_delayed_flow = ~0ULL;
+ q->rate_enable = 1;
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->fq_root = NULL;
+ q->fq_trees_log = ilog2(1024);
+ q->orphan_mask = 1024 - 1;
+ q->low_rate_threshold = 550000 / 8;
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (opt)
+ err = fq_change(sch, opt, extack);
+ else
+ err = fq_resize(sch, q->fq_trees_log);
+
+ return err;
+}
+
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
+ if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+ nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+ nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
+ jiffies_to_usecs(q->flow_refill_delay)) ||
+ nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+ nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
+ q->low_rate_threshold) ||
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_qd_stats st;
+
+ sch_tree_lock(sch);
+
+ st.gc_flows = q->stat_gc_flows;
+ st.highprio_packets = q->stat_internal_packets;
+ st.tcp_retrans = q->stat_tcp_retrans;
+ st.throttled = q->stat_throttled;
+ st.flows_plimit = q->stat_flows_plimit;
+ st.pkts_too_long = q->stat_pkts_too_long;
+ st.allocation_errors = q->stat_allocation_errors;
+ st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.flows = q->flows;
+ st.inactive_flows = q->inactive_flows;
+ st.throttled_flows = q->throttled_flows;
+ st.unthrottle_latency_ns = min_t(unsigned long,
+ q->unthrottle_latency_ns, ~0U);
+ sch_tree_unlock(sch);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+ .id = "fq",
+ .priv_size = sizeof(struct fq_sched_data),
+
+ .enqueue = fq_enqueue,
+ .dequeue = fq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_init,
+ .reset = fq_reset,
+ .destroy = fq_destroy,
+ .change = fq_change,
+ .dump = fq_dump,
+ .dump_stats = fq_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_module_init(void)
+{
+ int ret;
+
+ fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+ sizeof(struct fq_flow),
+ 0, 0, NULL);
+ if (!fq_flow_cachep)
+ return -ENOMEM;
+
+ ret = register_qdisc(&fq_qdisc_ops);
+ if (ret)
+ kmem_cache_destroy(fq_flow_cachep);
+ return ret;
+}
+
+static void __exit fq_module_exit(void)
+{
+ unregister_qdisc(&fq_qdisc_ops);
+ kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
new file mode 100644
index 000000000..e4f69c779
--- /dev/null
+++ b/net/sched/sch_fq_codel.c
@@ -0,0 +1,744 @@
+/*
+ * Fair Queue CoDel discipline
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
+
+/* Fair Queue CoDel.
+ *
+ * Principles :
+ * Packets are classified (internal classifier or external) on flows.
+ * This is a Stochastic model (as we use a hash, several flows
+ * might be hashed on same slot)
+ * Each flow has a CoDel managed queue.
+ * Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ *
+ * For a given flow, packets are not reordered (CoDel uses a FIFO)
+ * head drops only.
+ * ECN capability is on by default.
+ * Low memory footprint (64 bytes per flow)
+ */
+
+struct fq_codel_flow {
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head flowchain;
+ int deficit;
+ u32 dropped; /* number of drops (or ECN marks) on this flow */
+ struct codel_vars cvars;
+}; /* please try to keep this structure <= 64 bytes */
+
+struct fq_codel_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
+ u32 *backlogs; /* backlog table [flows_cnt] */
+ u32 flows_cnt; /* number of flows */
+ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ u32 drop_batch_size;
+ u32 memory_limit;
+ struct codel_params cparams;
+ struct codel_stats cstats;
+ u32 memory_usage;
+ u32 drop_overmemory;
+ u32 drop_overlimit;
+ u32 new_flow_count;
+
+ struct list_head new_flows; /* list of new flows */
+ struct list_head old_flows; /* list of old flows */
+};
+
+static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
+ struct sk_buff *skb)
+{
+ return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
+}
+
+static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->flows_cnt)
+ return TC_H_MIN(skb->priority);
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ return fq_codel_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, filter, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->flows_cnt)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ flow->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+/* add skb to flow queue (tail add) */
+static inline void flow_queue_add(struct fq_codel_flow *flow,
+ struct sk_buff *skb)
+{
+ if (flow->head == NULL)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
+ struct sk_buff **to_free)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ unsigned int maxbacklog = 0, idx = 0, i, len;
+ struct fq_codel_flow *flow;
+ unsigned int threshold;
+ unsigned int mem = 0;
+
+ /* Queue is full! Find the fat flow and drop packet(s) from it.
+ * This might sound expensive, but with 1024 flows, we scan
+ * 4KB of memory, and we dont need to handle a complex tree
+ * in fast path (packet queue/enqueue) with many cache misses.
+ * In stress mode, we'll try to drop 64 packets from the flow,
+ * amortizing this linear lookup to one cache line per drop.
+ */
+ for (i = 0; i < q->flows_cnt; i++) {
+ if (q->backlogs[i] > maxbacklog) {
+ maxbacklog = q->backlogs[i];
+ idx = i;
+ }
+ }
+
+ /* Our goal is to drop half of this fat flow backlog */
+ threshold = maxbacklog >> 1;
+
+ flow = &q->flows[idx];
+ len = 0;
+ i = 0;
+ do {
+ skb = dequeue_head(flow);
+ len += qdisc_pkt_len(skb);
+ mem += get_codel_cb(skb)->mem_usage;
+ __qdisc_drop(skb, to_free);
+ } while (++i < max_packets && len < threshold);
+
+ flow->dropped += i;
+ q->backlogs[idx] -= len;
+ q->memory_usage -= mem;
+ sch->qstats.drops += i;
+ sch->qstats.backlog -= len;
+ sch->q.qlen -= i;
+ return idx;
+}
+
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ unsigned int idx, prev_backlog, prev_qlen;
+ struct fq_codel_flow *flow;
+ int uninitialized_var(ret);
+ unsigned int pkt_len;
+ bool memory_limited;
+
+ idx = fq_codel_classify(skb, sch, &ret);
+ if (idx == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+ idx--;
+
+ codel_set_enqueue_time(skb);
+ flow = &q->flows[idx];
+ flow_queue_add(flow, skb);
+ q->backlogs[idx] += qdisc_pkt_len(skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+
+ if (list_empty(&flow->flowchain)) {
+ list_add_tail(&flow->flowchain, &q->new_flows);
+ q->new_flow_count++;
+ flow->deficit = q->quantum;
+ flow->dropped = 0;
+ }
+ get_codel_cb(skb)->mem_usage = skb->truesize;
+ q->memory_usage += get_codel_cb(skb)->mem_usage;
+ memory_limited = q->memory_usage > q->memory_limit;
+ if (++sch->q.qlen <= sch->limit && !memory_limited)
+ return NET_XMIT_SUCCESS;
+
+ prev_backlog = sch->qstats.backlog;
+ prev_qlen = sch->q.qlen;
+
+ /* save this packet length as it might be dropped by fq_codel_drop() */
+ pkt_len = qdisc_pkt_len(skb);
+ /* fq_codel_drop() is quite expensive, as it performs a linear search
+ * in q->backlogs[] to find a fat flow.
+ * So instead of dropping a single packet, drop half of its backlog
+ * with a 64 packets limit to not add a too big cpu spike here.
+ */
+ ret = fq_codel_drop(sch, q->drop_batch_size, to_free);
+
+ prev_qlen -= sch->q.qlen;
+ prev_backlog -= sch->qstats.backlog;
+ q->drop_overlimit += prev_qlen;
+ if (memory_limited)
+ q->drop_overmemory += prev_qlen;
+
+ /* As we dropped packet(s), better let upper stack know this.
+ * If we dropped a packet for this flow, return NET_XMIT_CN,
+ * but in this case, our parents wont increase their backlogs.
+ */
+ if (ret == idx) {
+ qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
+ prev_backlog - pkt_len);
+ return NET_XMIT_CN;
+ }
+ qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
+ return NET_XMIT_SUCCESS;
+}
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from queue. Note: backlog is handled in
+ * codel, we dont need to reduce it here.
+ */
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
+{
+ struct Qdisc *sch = ctx;
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct fq_codel_flow *flow;
+ struct sk_buff *skb = NULL;
+
+ flow = container_of(vars, struct fq_codel_flow, cvars);
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
+ q->memory_usage -= get_codel_cb(skb)->mem_usage;
+ sch->q.qlen--;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ }
+ return skb;
+}
+
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+ struct Qdisc *sch = ctx;
+
+ kfree_skb(skb);
+ qdisc_qstats_drop(sch);
+}
+
+static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ struct fq_codel_flow *flow;
+ struct list_head *head;
+ u32 prev_drop_count, prev_ecn_mark;
+
+begin:
+ head = &q->new_flows;
+ if (list_empty(head)) {
+ head = &q->old_flows;
+ if (list_empty(head))
+ return NULL;
+ }
+ flow = list_first_entry(head, struct fq_codel_flow, flowchain);
+
+ if (flow->deficit <= 0) {
+ flow->deficit += q->quantum;
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ goto begin;
+ }
+
+ prev_drop_count = q->cstats.drop_count;
+ prev_ecn_mark = q->cstats.ecn_mark;
+
+ skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
+ &flow->cvars, &q->cstats, qdisc_pkt_len,
+ codel_get_enqueue_time, drop_func, dequeue_func);
+
+ flow->dropped += q->cstats.drop_count - prev_drop_count;
+ flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
+
+ if (!skb) {
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && !list_empty(&q->old_flows))
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ else
+ list_del_init(&flow->flowchain);
+ goto begin;
+ }
+ qdisc_bstats_update(sch, skb);
+ flow->deficit -= qdisc_pkt_len(skb);
+ /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
+ * or HTB crashes. Defer it for next round.
+ */
+ if (q->cstats.drop_count && sch->q.qlen) {
+ qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
+ q->cstats.drop_len);
+ q->cstats.drop_count = 0;
+ q->cstats.drop_len = 0;
+ }
+ return skb;
+}
+
+static void fq_codel_flow_purge(struct fq_codel_flow *flow)
+{
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+}
+
+static void fq_codel_reset(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ for (i = 0; i < q->flows_cnt; i++) {
+ struct fq_codel_flow *flow = q->flows + i;
+
+ fq_codel_flow_purge(flow);
+ INIT_LIST_HEAD(&flow->flowchain);
+ codel_vars_init(&flow->cvars);
+ }
+ memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+ q->memory_usage = 0;
+}
+
+static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
+ [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 },
+};
+
+static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
+ u32 quantum = 0;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy,
+ NULL);
+ if (err < 0)
+ return err;
+ if (tb[TCA_FQ_CODEL_FLOWS]) {
+ if (q->flows)
+ return -EINVAL;
+ q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
+ if (!q->flows_cnt ||
+ q->flows_cnt > 65536)
+ return -EINVAL;
+ }
+ if (tb[TCA_FQ_CODEL_QUANTUM]) {
+ quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+ if (quantum > FQ_CODEL_QUANTUM_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid quantum");
+ return -EINVAL;
+ }
+ }
+ sch_tree_lock(sch);
+
+ if (tb[TCA_FQ_CODEL_TARGET]) {
+ u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
+
+ q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) {
+ u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]);
+
+ q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_FQ_CODEL_INTERVAL]) {
+ u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
+
+ q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_FQ_CODEL_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
+
+ if (tb[TCA_FQ_CODEL_ECN])
+ q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
+
+ if (quantum)
+ q->quantum = quantum;
+
+ if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])
+ q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));
+
+ if (tb[TCA_FQ_CODEL_MEMORY_LIMIT])
+ q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]));
+
+ while (sch->q.qlen > sch->limit ||
+ q->memory_usage > q->memory_limit) {
+ struct sk_buff *skb = fq_codel_dequeue(sch);
+
+ q->cstats.drop_len += qdisc_pkt_len(skb);
+ rtnl_kfree_skbs(skb, skb);
+ q->cstats.drop_count++;
+ }
+ qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
+ q->cstats.drop_count = 0;
+ q->cstats.drop_len = 0;
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static void fq_codel_destroy(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ kvfree(q->backlogs);
+ kvfree(q->flows);
+}
+
+static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ int i;
+ int err;
+
+ sch->limit = 10*1024;
+ q->flows_cnt = 1024;
+ q->memory_limit = 32 << 20; /* 32 MBytes */
+ q->drop_batch_size = 64;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ codel_params_init(&q->cparams);
+ codel_stats_init(&q->cstats);
+ q->cparams.ecn = true;
+ q->cparams.mtu = psched_mtu(qdisc_dev(sch));
+
+ if (opt) {
+ err = fq_codel_change(sch, opt, extack);
+ if (err)
+ goto init_failure;
+ }
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ goto init_failure;
+
+ if (!q->flows) {
+ q->flows = kvcalloc(q->flows_cnt,
+ sizeof(struct fq_codel_flow),
+ GFP_KERNEL);
+ if (!q->flows) {
+ err = -ENOMEM;
+ goto init_failure;
+ }
+ q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL);
+ if (!q->backlogs) {
+ err = -ENOMEM;
+ goto alloc_failure;
+ }
+ for (i = 0; i < q->flows_cnt; i++) {
+ struct fq_codel_flow *flow = q->flows + i;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ codel_vars_init(&flow->cvars);
+ }
+ }
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+
+alloc_failure:
+ kvfree(q->flows);
+ q->flows = NULL;
+init_failure:
+ q->flows_cnt = 0;
+ return err;
+}
+
+static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
+ codel_time_to_us(q->cparams.target)) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
+ sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
+ codel_time_to_us(q->cparams.interval)) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_ECN,
+ q->cparams.ecn) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
+ q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,
+ q->drop_batch_size) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT,
+ q->memory_limit) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
+ q->flows_cnt))
+ goto nla_put_failure;
+
+ if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD &&
+ nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD,
+ codel_time_to_us(q->cparams.ce_threshold)))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_codel_xstats st = {
+ .type = TCA_FQ_CODEL_XSTATS_QDISC,
+ };
+ struct list_head *pos;
+
+ st.qdisc_stats.maxpacket = q->cstats.maxpacket;
+ st.qdisc_stats.drop_overlimit = q->drop_overlimit;
+ st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
+ st.qdisc_stats.new_flow_count = q->new_flow_count;
+ st.qdisc_stats.ce_mark = q->cstats.ce_mark;
+ st.qdisc_stats.memory_usage = q->memory_usage;
+ st.qdisc_stats.drop_overmemory = q->drop_overmemory;
+
+ sch_tree_lock(sch);
+ list_for_each(pos, &q->new_flows)
+ st.qdisc_stats.new_flows_len++;
+
+ list_for_each(pos, &q->old_flows)
+ st.qdisc_stats.old_flows_len++;
+ sch_tree_unlock(sch);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static void fq_codel_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->block;
+}
+
+static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ u32 idx = cl - 1;
+ struct gnet_stats_queue qs = { 0 };
+ struct tc_fq_codel_xstats xstats;
+
+ if (idx < q->flows_cnt) {
+ const struct fq_codel_flow *flow = &q->flows[idx];
+ const struct sk_buff *skb;
+
+ memset(&xstats, 0, sizeof(xstats));
+ xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
+ xstats.class_stats.deficit = flow->deficit;
+ xstats.class_stats.ldelay =
+ codel_time_to_us(flow->cvars.ldelay);
+ xstats.class_stats.count = flow->cvars.count;
+ xstats.class_stats.lastcount = flow->cvars.lastcount;
+ xstats.class_stats.dropping = flow->cvars.dropping;
+ if (flow->cvars.dropping) {
+ codel_tdiff_t delta = flow->cvars.drop_next -
+ codel_get_time();
+
+ xstats.class_stats.drop_next = (delta >= 0) ?
+ codel_time_to_us(delta) :
+ -codel_time_to_us(-delta);
+ }
+ if (flow->head) {
+ sch_tree_lock(sch);
+ skb = flow->head;
+ while (skb) {
+ qs.qlen++;
+ skb = skb->next;
+ }
+ sch_tree_unlock(sch);
+ }
+ qs.backlog = q->backlogs[idx];
+ qs.drops = flow->dropped;
+ }
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+ return -1;
+ if (idx < q->flows_cnt)
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+ return 0;
+}
+
+static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->flows_cnt; i++) {
+ if (list_empty(&q->flows[i].flowchain) ||
+ arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops fq_codel_class_ops = {
+ .leaf = fq_codel_leaf,
+ .find = fq_codel_find,
+ .tcf_block = fq_codel_tcf_block,
+ .bind_tcf = fq_codel_bind,
+ .unbind_tcf = fq_codel_unbind,
+ .dump = fq_codel_dump_class,
+ .dump_stats = fq_codel_dump_class_stats,
+ .walk = fq_codel_walk,
+};
+
+static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
+ .cl_ops = &fq_codel_class_ops,
+ .id = "fq_codel",
+ .priv_size = sizeof(struct fq_codel_sched_data),
+ .enqueue = fq_codel_enqueue,
+ .dequeue = fq_codel_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_codel_init,
+ .reset = fq_codel_reset,
+ .destroy = fq_codel_destroy,
+ .change = fq_codel_change,
+ .dump = fq_codel_dump,
+ .dump_stats = fq_codel_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_codel_module_init(void)
+{
+ return register_qdisc(&fq_codel_qdisc_ops);
+}
+
+static void __exit fq_codel_module_exit(void)
+{
+ unregister_qdisc(&fq_codel_qdisc_ops);
+}
+
+module_init(fq_codel_module_init)
+module_exit(fq_codel_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
new file mode 100644
index 000000000..7c1b1eff8
--- /dev/null
+++ b/net/sched/sch_generic.c
@@ -0,0 +1,1447 @@
+/*
+ * net/sched/sch_generic.c Generic packet scheduler routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Jamal Hadi Salim, <hadi@cyberus.ca> 990601
+ * - Ingress support
+ */
+
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/if_vlan.h>
+#include <linux/skb_array.h>
+#include <linux/if_macvlan.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+#include <trace/events/qdisc.h>
+#include <net/xfrm.h>
+
+/* Qdisc to use by default */
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
+/* Main transmission queue. */
+
+/* Modifications to data participating in scheduling must be protected with
+ * qdisc_lock(qdisc) spinlock.
+ *
+ * The idea is the following:
+ * - enqueue, dequeue are serialized via qdisc root lock
+ * - ingress filtering is also serialized via qdisc root lock
+ * - updates to tree and tree walking are only done under the rtnl mutex.
+ */
+
+#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
+
+static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
+{
+ const struct netdev_queue *txq = q->dev_queue;
+ spinlock_t *lock = NULL;
+ struct sk_buff *skb;
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
+ }
+
+ skb = skb_peek(&q->skb_bad_txq);
+ if (skb) {
+ /* check the reason of requeuing without tx lock first */
+ txq = skb_get_tx_queue(txq->dev, skb);
+ if (!netif_xmit_frozen_or_stopped(txq)) {
+ skb = __skb_dequeue(&q->skb_bad_txq);
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_backlog_dec(q, skb);
+ qdisc_qstats_atomic_qlen_dec(q);
+ } else {
+ qdisc_qstats_backlog_dec(q, skb);
+ q->q.qlen--;
+ }
+ } else {
+ skb = SKB_XOFF_MAGIC;
+ }
+ }
+
+ if (lock)
+ spin_unlock(lock);
+
+ return skb;
+}
+
+static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
+{
+ struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
+
+ if (unlikely(skb))
+ skb = __skb_dequeue_bad_txq(q);
+
+ return skb;
+}
+
+static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
+ struct sk_buff *skb)
+{
+ spinlock_t *lock = NULL;
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
+ }
+
+ __skb_queue_tail(&q->skb_bad_txq, skb);
+
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_backlog_inc(q, skb);
+ qdisc_qstats_atomic_qlen_inc(q);
+ } else {
+ qdisc_qstats_backlog_inc(q, skb);
+ q->q.qlen++;
+ }
+
+ if (lock)
+ spin_unlock(lock);
+}
+
+static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+ while (skb) {
+ struct sk_buff *next = skb->next;
+
+ __skb_queue_tail(&q->gso_skb, skb);
+ q->qstats.requeues++;
+ qdisc_qstats_backlog_inc(q, skb);
+ q->q.qlen++; /* it's still part of the queue */
+
+ skb = next;
+ }
+ __netif_schedule(q);
+
+ return 0;
+}
+
+static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
+{
+ spinlock_t *lock = qdisc_lock(q);
+
+ spin_lock(lock);
+ while (skb) {
+ struct sk_buff *next = skb->next;
+
+ __skb_queue_tail(&q->gso_skb, skb);
+
+ qdisc_qstats_cpu_requeues_inc(q);
+ qdisc_qstats_cpu_backlog_inc(q, skb);
+ qdisc_qstats_atomic_qlen_inc(q);
+
+ skb = next;
+ }
+ spin_unlock(lock);
+
+ __netif_schedule(q);
+
+ return 0;
+}
+
+static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+ if (q->flags & TCQ_F_NOLOCK)
+ return dev_requeue_skb_locked(skb, q);
+ else
+ return __dev_requeue_skb(skb, q);
+}
+
+static void try_bulk_dequeue_skb(struct Qdisc *q,
+ struct sk_buff *skb,
+ const struct netdev_queue *txq,
+ int *packets)
+{
+ int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
+
+ while (bytelimit > 0) {
+ struct sk_buff *nskb = q->dequeue(q);
+
+ if (!nskb)
+ break;
+
+ bytelimit -= nskb->len; /* covers GSO len */
+ skb->next = nskb;
+ skb = nskb;
+ (*packets)++; /* GSO counts as one pkt */
+ }
+ skb->next = NULL;
+}
+
+/* This variant of try_bulk_dequeue_skb() makes sure
+ * all skbs in the chain are for the same txq
+ */
+static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
+ struct sk_buff *skb,
+ int *packets)
+{
+ int mapping = skb_get_queue_mapping(skb);
+ struct sk_buff *nskb;
+ int cnt = 0;
+
+ do {
+ nskb = q->dequeue(q);
+ if (!nskb)
+ break;
+ if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
+ qdisc_enqueue_skb_bad_txq(q, nskb);
+ break;
+ }
+ skb->next = nskb;
+ skb = nskb;
+ } while (++cnt < 8);
+ (*packets) += cnt;
+ skb->next = NULL;
+}
+
+/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
+ * A requeued skb (via q->gso_skb) can also be a SKB list.
+ */
+static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
+ int *packets)
+{
+ const struct netdev_queue *txq = q->dev_queue;
+ struct sk_buff *skb = NULL;
+
+ *packets = 1;
+ if (unlikely(!skb_queue_empty(&q->gso_skb))) {
+ spinlock_t *lock = NULL;
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
+ }
+
+ skb = skb_peek(&q->gso_skb);
+
+ /* skb may be null if another cpu pulls gso_skb off in between
+ * empty check and lock.
+ */
+ if (!skb) {
+ if (lock)
+ spin_unlock(lock);
+ goto validate;
+ }
+
+ /* skb in gso_skb were already validated */
+ *validate = false;
+ if (xfrm_offload(skb))
+ *validate = true;
+ /* check the reason of requeuing without tx lock first */
+ txq = skb_get_tx_queue(txq->dev, skb);
+ if (!netif_xmit_frozen_or_stopped(txq)) {
+ skb = __skb_dequeue(&q->gso_skb);
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_backlog_dec(q, skb);
+ qdisc_qstats_atomic_qlen_dec(q);
+ } else {
+ qdisc_qstats_backlog_dec(q, skb);
+ q->q.qlen--;
+ }
+ } else {
+ skb = NULL;
+ }
+ if (lock)
+ spin_unlock(lock);
+ goto trace;
+ }
+validate:
+ *validate = true;
+
+ if ((q->flags & TCQ_F_ONETXQUEUE) &&
+ netif_xmit_frozen_or_stopped(txq))
+ return skb;
+
+ skb = qdisc_dequeue_skb_bad_txq(q);
+ if (unlikely(skb)) {
+ if (skb == SKB_XOFF_MAGIC)
+ return NULL;
+ goto bulk;
+ }
+ skb = q->dequeue(q);
+ if (skb) {
+bulk:
+ if (qdisc_may_bulk(q))
+ try_bulk_dequeue_skb(q, skb, txq, packets);
+ else
+ try_bulk_dequeue_skb_slow(q, skb, packets);
+ }
+trace:
+ trace_qdisc_dequeue(q, txq, *packets, skb);
+ return skb;
+}
+
+/*
+ * Transmit possibly several skbs, and handle the return status as
+ * required. Owning running seqcount bit guarantees that
+ * only one CPU can execute this function.
+ *
+ * Returns to the caller:
+ * false - hardware queue frozen backoff
+ * true - feel free to send more pkts
+ */
+bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev, struct netdev_queue *txq,
+ spinlock_t *root_lock, bool validate)
+{
+ int ret = NETDEV_TX_BUSY;
+ bool again = false;
+
+ /* And release qdisc */
+ if (root_lock)
+ spin_unlock(root_lock);
+
+ /* Note that we validate skb (GSO, checksum, ...) outside of locks */
+ if (validate)
+ skb = validate_xmit_skb_list(skb, dev, &again);
+
+#ifdef CONFIG_XFRM_OFFLOAD
+ if (unlikely(again)) {
+ if (root_lock)
+ spin_lock(root_lock);
+
+ dev_requeue_skb(skb, q);
+ return false;
+ }
+#endif
+
+ if (likely(skb)) {
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_stopped(txq))
+ skb = dev_hard_start_xmit(skb, dev, txq, &ret);
+
+ HARD_TX_UNLOCK(dev, txq);
+ } else {
+ if (root_lock)
+ spin_lock(root_lock);
+ return true;
+ }
+
+ if (root_lock)
+ spin_lock(root_lock);
+
+ if (!dev_xmit_complete(ret)) {
+ /* Driver returned NETDEV_TX_BUSY - requeue skb */
+ if (unlikely(ret != NETDEV_TX_BUSY))
+ net_warn_ratelimited("BUG %s code %d qlen %d\n",
+ dev->name, ret, q->q.qlen);
+
+ dev_requeue_skb(skb, q);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * NOTE: Called under qdisc_lock(q) with locally disabled BH.
+ *
+ * running seqcount guarantees only one CPU can process
+ * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
+ * this queue.
+ *
+ * netif_tx_lock serializes accesses to device driver.
+ *
+ * qdisc_lock(q) and netif_tx_lock are mutually exclusive,
+ * if one is grabbed, another must be free.
+ *
+ * Note, that this procedure can be called by a watchdog timer
+ *
+ * Returns to the caller:
+ * 0 - queue is empty or throttled.
+ * >0 - queue is not empty.
+ *
+ */
+static inline bool qdisc_restart(struct Qdisc *q, int *packets)
+{
+ spinlock_t *root_lock = NULL;
+ struct netdev_queue *txq;
+ struct net_device *dev;
+ struct sk_buff *skb;
+ bool validate;
+
+ /* Dequeue packet */
+ skb = dequeue_skb(q, &validate, packets);
+ if (unlikely(!skb))
+ return false;
+
+ if (!(q->flags & TCQ_F_NOLOCK))
+ root_lock = qdisc_lock(q);
+
+ dev = qdisc_dev(q);
+ txq = skb_get_tx_queue(dev, skb);
+
+ return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
+}
+
+void __qdisc_run(struct Qdisc *q)
+{
+ int quota = dev_tx_weight;
+ int packets;
+
+ while (qdisc_restart(q, &packets)) {
+ /*
+ * Ordered by possible occurrence: Postpone processing if
+ * 1. we've exceeded packet quota
+ * 2. another process needs the CPU;
+ */
+ quota -= packets;
+ if (quota <= 0 || need_resched()) {
+ __netif_schedule(q);
+ break;
+ }
+ }
+}
+
+unsigned long dev_trans_start(struct net_device *dev)
+{
+ unsigned long val, res;
+ unsigned int i;
+
+ if (is_vlan_dev(dev))
+ dev = vlan_dev_real_dev(dev);
+ else if (netif_is_macvlan(dev))
+ dev = macvlan_dev_real_dev(dev);
+ res = netdev_get_tx_queue(dev, 0)->trans_start;
+ for (i = 1; i < dev->num_tx_queues; i++) {
+ val = netdev_get_tx_queue(dev, i)->trans_start;
+ if (val && time_after(val, res))
+ res = val;
+ }
+
+ return res;
+}
+EXPORT_SYMBOL(dev_trans_start);
+
+static void dev_watchdog(struct timer_list *t)
+{
+ struct net_device *dev = from_timer(dev, t, watchdog_timer);
+
+ netif_tx_lock(dev);
+ if (!qdisc_tx_is_noop(dev)) {
+ if (netif_device_present(dev) &&
+ netif_running(dev) &&
+ netif_carrier_ok(dev)) {
+ int some_queue_timedout = 0;
+ unsigned int i;
+ unsigned long trans_start;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq;
+
+ txq = netdev_get_tx_queue(dev, i);
+ trans_start = txq->trans_start;
+ if (netif_xmit_stopped(txq) &&
+ time_after(jiffies, (trans_start +
+ dev->watchdog_timeo))) {
+ some_queue_timedout = 1;
+ txq->trans_timeout++;
+ break;
+ }
+ }
+
+ if (some_queue_timedout) {
+ WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
+ dev->name, netdev_drivername(dev), i);
+ dev->netdev_ops->ndo_tx_timeout(dev);
+ }
+ if (!mod_timer(&dev->watchdog_timer,
+ round_jiffies(jiffies +
+ dev->watchdog_timeo)))
+ dev_hold(dev);
+ }
+ }
+ netif_tx_unlock(dev);
+
+ dev_put(dev);
+}
+
+void __netdev_watchdog_up(struct net_device *dev)
+{
+ if (dev->netdev_ops->ndo_tx_timeout) {
+ if (dev->watchdog_timeo <= 0)
+ dev->watchdog_timeo = 5*HZ;
+ if (!mod_timer(&dev->watchdog_timer,
+ round_jiffies(jiffies + dev->watchdog_timeo)))
+ dev_hold(dev);
+ }
+}
+EXPORT_SYMBOL_GPL(__netdev_watchdog_up);
+
+static void dev_watchdog_up(struct net_device *dev)
+{
+ __netdev_watchdog_up(dev);
+}
+
+static void dev_watchdog_down(struct net_device *dev)
+{
+ netif_tx_lock_bh(dev);
+ if (del_timer(&dev->watchdog_timer))
+ dev_put(dev);
+ netif_tx_unlock_bh(dev);
+}
+
+/**
+ * netif_carrier_on - set carrier
+ * @dev: network device
+ *
+ * Device has detected that carrier.
+ */
+void netif_carrier_on(struct net_device *dev)
+{
+ if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ return;
+ atomic_inc(&dev->carrier_up_count);
+ linkwatch_fire_event(dev);
+ if (netif_running(dev))
+ __netdev_watchdog_up(dev);
+ }
+}
+EXPORT_SYMBOL(netif_carrier_on);
+
+/**
+ * netif_carrier_off - clear carrier
+ * @dev: network device
+ *
+ * Device has detected loss of carrier.
+ */
+void netif_carrier_off(struct net_device *dev)
+{
+ if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ return;
+ atomic_inc(&dev->carrier_down_count);
+ linkwatch_fire_event(dev);
+ }
+}
+EXPORT_SYMBOL(netif_carrier_off);
+
+/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
+ under all circumstances. It is difficult to invent anything faster or
+ cheaper.
+ */
+
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+ struct sk_buff **to_free)
+{
+ __qdisc_drop(skb, to_free);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
+{
+ return NULL;
+}
+
+struct Qdisc_ops noop_qdisc_ops __read_mostly = {
+ .id = "noop",
+ .priv_size = 0,
+ .enqueue = noop_enqueue,
+ .dequeue = noop_dequeue,
+ .peek = noop_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static struct netdev_queue noop_netdev_queue = {
+ .qdisc = &noop_qdisc,
+ .qdisc_sleeping = &noop_qdisc,
+};
+
+struct Qdisc noop_qdisc = {
+ .enqueue = noop_enqueue,
+ .dequeue = noop_dequeue,
+ .flags = TCQ_F_BUILTIN,
+ .ops = &noop_qdisc_ops,
+ .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
+ .dev_queue = &noop_netdev_queue,
+ .running = SEQCNT_ZERO(noop_qdisc.running),
+ .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
+ .gso_skb = {
+ .next = (struct sk_buff *)&noop_qdisc.gso_skb,
+ .prev = (struct sk_buff *)&noop_qdisc.gso_skb,
+ .qlen = 0,
+ .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
+ },
+ .skb_bad_txq = {
+ .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
+ .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
+ .qlen = 0,
+ .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
+ },
+};
+EXPORT_SYMBOL(noop_qdisc);
+
+static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ /* register_qdisc() assigns a default of noop_enqueue if unset,
+ * but __dev_queue_xmit() treats noqueue only as such
+ * if this is NULL - so clear it here. */
+ qdisc->enqueue = NULL;
+ return 0;
+}
+
+struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
+ .id = "noqueue",
+ .priv_size = 0,
+ .init = noqueue_init,
+ .enqueue = noop_enqueue,
+ .dequeue = noop_dequeue,
+ .peek = noop_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+/* 3-band FIFO queue: old style, but should be a bit faster than
+ generic prio+fifo combination.
+ */
+
+#define PFIFO_FAST_BANDS 3
+
+/*
+ * Private data for a pfifo_fast scheduler containing:
+ * - rings for priority bands
+ */
+struct pfifo_fast_priv {
+ struct skb_array q[PFIFO_FAST_BANDS];
+};
+
+static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
+ int band)
+{
+ return &priv->q[band];
+}
+
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+ struct sk_buff **to_free)
+{
+ int band = prio2band[skb->priority & TC_PRIO_MAX];
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ struct skb_array *q = band2list(priv, band);
+ unsigned int pkt_len = qdisc_pkt_len(skb);
+ int err;
+
+ err = skb_array_produce(q, skb);
+
+ if (unlikely(err))
+ return qdisc_drop_cpu(skb, qdisc, to_free);
+
+ qdisc_qstats_atomic_qlen_inc(qdisc);
+ /* Note: skb can not be used after skb_array_produce(),
+ * so we better not use qdisc_qstats_cpu_backlog_inc()
+ */
+ this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
+{
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ struct sk_buff *skb = NULL;
+ int band;
+
+ for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+ struct skb_array *q = band2list(priv, band);
+
+ if (__skb_array_empty(q))
+ continue;
+
+ skb = __skb_array_consume(q);
+ }
+ if (likely(skb)) {
+ qdisc_qstats_cpu_backlog_dec(qdisc, skb);
+ qdisc_bstats_cpu_update(qdisc, skb);
+ qdisc_qstats_atomic_qlen_dec(qdisc);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
+{
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ struct sk_buff *skb = NULL;
+ int band;
+
+ for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+ struct skb_array *q = band2list(priv, band);
+
+ skb = __skb_array_peek(q);
+ }
+
+ return skb;
+}
+
+static void pfifo_fast_reset(struct Qdisc *qdisc)
+{
+ int i, band;
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+
+ for (band = 0; band < PFIFO_FAST_BANDS; band++) {
+ struct skb_array *q = band2list(priv, band);
+ struct sk_buff *skb;
+
+ /* NULL ring is possible if destroy path is due to a failed
+ * skb_array_init() in pfifo_fast_init() case.
+ */
+ if (!q->ring.queue)
+ continue;
+
+ while ((skb = __skb_array_consume(q)) != NULL)
+ kfree_skb(skb);
+ }
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+
+ q->backlog = 0;
+ }
+}
+
+static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
+{
+ struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
+
+ memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ int prio;
+
+ /* guard against zero length rings */
+ if (!qlen)
+ return -EINVAL;
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ struct skb_array *q = band2list(priv, prio);
+ int err;
+
+ err = skb_array_init(q, qlen, GFP_KERNEL);
+ if (err)
+ return -ENOMEM;
+ }
+
+ /* Can by-pass the queue discipline */
+ qdisc->flags |= TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static void pfifo_fast_destroy(struct Qdisc *sch)
+{
+ struct pfifo_fast_priv *priv = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ struct skb_array *q = band2list(priv, prio);
+
+ /* NULL ring is possible if destroy path is due to a failed
+ * skb_array_init() in pfifo_fast_init() case.
+ */
+ if (!q->ring.queue)
+ continue;
+ /* Destroy ring but no need to kfree_skb because a call to
+ * pfifo_fast_reset() has already done that work.
+ */
+ ptr_ring_cleanup(&q->ring, NULL);
+ }
+}
+
+static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
+ unsigned int new_len)
+{
+ struct pfifo_fast_priv *priv = qdisc_priv(sch);
+ struct skb_array *bands[PFIFO_FAST_BANDS];
+ int prio;
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ struct skb_array *q = band2list(priv, prio);
+
+ bands[prio] = q;
+ }
+
+ return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
+ GFP_KERNEL);
+}
+
+struct Qdisc_ops pfifo_fast_ops __read_mostly = {
+ .id = "pfifo_fast",
+ .priv_size = sizeof(struct pfifo_fast_priv),
+ .enqueue = pfifo_fast_enqueue,
+ .dequeue = pfifo_fast_dequeue,
+ .peek = pfifo_fast_peek,
+ .init = pfifo_fast_init,
+ .destroy = pfifo_fast_destroy,
+ .reset = pfifo_fast_reset,
+ .dump = pfifo_fast_dump,
+ .change_tx_queue_len = pfifo_fast_change_tx_queue_len,
+ .owner = THIS_MODULE,
+ .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
+};
+EXPORT_SYMBOL(pfifo_fast_ops);
+
+static struct lock_class_key qdisc_tx_busylock;
+static struct lock_class_key qdisc_running_key;
+
+struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
+ const struct Qdisc_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ void *p;
+ struct Qdisc *sch;
+ unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
+ int err = -ENOBUFS;
+ struct net_device *dev;
+
+ if (!dev_queue) {
+ NL_SET_ERR_MSG(extack, "No device queue given");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ dev = dev_queue->dev;
+ p = kzalloc_node(size, GFP_KERNEL,
+ netdev_queue_numa_node_read(dev_queue));
+
+ if (!p)
+ goto errout;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ /* if we got non aligned memory, ask more and do alignment ourself */
+ if (sch != p) {
+ kfree(p);
+ p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
+ netdev_queue_numa_node_read(dev_queue));
+ if (!p)
+ goto errout;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ sch->padded = (char *) sch - (char *) p;
+ }
+ __skb_queue_head_init(&sch->gso_skb);
+ __skb_queue_head_init(&sch->skb_bad_txq);
+ qdisc_skb_head_init(&sch->q);
+ spin_lock_init(&sch->q.lock);
+
+ if (ops->static_flags & TCQ_F_CPUSTATS) {
+ sch->cpu_bstats =
+ netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!sch->cpu_bstats)
+ goto errout1;
+
+ sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!sch->cpu_qstats) {
+ free_percpu(sch->cpu_bstats);
+ goto errout1;
+ }
+ }
+
+ spin_lock_init(&sch->busylock);
+ lockdep_set_class(&sch->busylock,
+ dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
+ /* seqlock has the same scope of busylock, for NOLOCK qdisc */
+ spin_lock_init(&sch->seqlock);
+ lockdep_set_class(&sch->busylock,
+ dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
+ seqcount_init(&sch->running);
+ lockdep_set_class(&sch->running,
+ dev->qdisc_running_key ?: &qdisc_running_key);
+
+ sch->ops = ops;
+ sch->flags = ops->static_flags;
+ sch->enqueue = ops->enqueue;
+ sch->dequeue = ops->dequeue;
+ sch->dev_queue = dev_queue;
+ dev_hold(dev);
+ refcount_set(&sch->refcnt, 1);
+
+ return sch;
+errout1:
+ kfree(p);
+errout:
+ return ERR_PTR(err);
+}
+
+struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
+ const struct Qdisc_ops *ops,
+ unsigned int parentid,
+ struct netlink_ext_ack *extack)
+{
+ struct Qdisc *sch;
+
+ if (!try_module_get(ops->owner)) {
+ NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
+ return NULL;
+ }
+
+ sch = qdisc_alloc(dev_queue, ops, extack);
+ if (IS_ERR(sch)) {
+ module_put(ops->owner);
+ return NULL;
+ }
+ sch->parent = parentid;
+
+ if (!ops->init || ops->init(sch, NULL, extack) == 0)
+ return sch;
+
+ qdisc_put(sch);
+ return NULL;
+}
+EXPORT_SYMBOL(qdisc_create_dflt);
+
+/* Under qdisc_lock(qdisc) and BH! */
+
+void qdisc_reset(struct Qdisc *qdisc)
+{
+ const struct Qdisc_ops *ops = qdisc->ops;
+ struct sk_buff *skb, *tmp;
+
+ if (ops->reset)
+ ops->reset(qdisc);
+
+ skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+ __skb_unlink(skb, &qdisc->gso_skb);
+ kfree_skb_list(skb);
+ }
+
+ skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+ __skb_unlink(skb, &qdisc->skb_bad_txq);
+ kfree_skb_list(skb);
+ }
+
+ qdisc->q.qlen = 0;
+ qdisc->qstats.backlog = 0;
+}
+EXPORT_SYMBOL(qdisc_reset);
+
+void qdisc_free(struct Qdisc *qdisc)
+{
+ if (qdisc_is_percpu_stats(qdisc)) {
+ free_percpu(qdisc->cpu_bstats);
+ free_percpu(qdisc->cpu_qstats);
+ }
+
+ kfree((char *) qdisc - qdisc->padded);
+}
+
+static void qdisc_free_cb(struct rcu_head *head)
+{
+ struct Qdisc *q = container_of(head, struct Qdisc, rcu);
+
+ qdisc_free(q);
+}
+
+static void qdisc_destroy(struct Qdisc *qdisc)
+{
+ const struct Qdisc_ops *ops;
+ struct sk_buff *skb, *tmp;
+
+ if (!qdisc)
+ return;
+ ops = qdisc->ops;
+
+#ifdef CONFIG_NET_SCHED
+ qdisc_hash_del(qdisc);
+
+ qdisc_put_stab(rtnl_dereference(qdisc->stab));
+#endif
+ gen_kill_estimator(&qdisc->rate_est);
+ if (ops->reset)
+ ops->reset(qdisc);
+ if (ops->destroy)
+ ops->destroy(qdisc);
+
+ module_put(ops->owner);
+ dev_put(qdisc_dev(qdisc));
+
+ skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
+ __skb_unlink(skb, &qdisc->gso_skb);
+ kfree_skb_list(skb);
+ }
+
+ skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
+ __skb_unlink(skb, &qdisc->skb_bad_txq);
+ kfree_skb_list(skb);
+ }
+
+ call_rcu(&qdisc->rcu, qdisc_free_cb);
+}
+
+void qdisc_put(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !refcount_dec_and_test(&qdisc->refcnt))
+ return;
+
+ qdisc_destroy(qdisc);
+}
+EXPORT_SYMBOL(qdisc_put);
+
+/* Version of qdisc_put() that is called with rtnl mutex unlocked.
+ * Intended to be used as optimization, this function only takes rtnl lock if
+ * qdisc reference counter reached zero.
+ */
+
+void qdisc_put_unlocked(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
+ return;
+
+ qdisc_destroy(qdisc);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(qdisc_put_unlocked);
+
+/* Attach toplevel qdisc to device queue. */
+struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
+ struct Qdisc *qdisc)
+{
+ struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
+ spinlock_t *root_lock;
+
+ root_lock = qdisc_lock(oqdisc);
+ spin_lock_bh(root_lock);
+
+ /* ... and graft new one */
+ if (qdisc == NULL)
+ qdisc = &noop_qdisc;
+ dev_queue->qdisc_sleeping = qdisc;
+ rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
+
+ spin_unlock_bh(root_lock);
+
+ return oqdisc;
+}
+EXPORT_SYMBOL(dev_graft_qdisc);
+
+static void attach_one_default_qdisc(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_unused)
+{
+ struct Qdisc *qdisc;
+ const struct Qdisc_ops *ops = default_qdisc_ops;
+
+ if (dev->priv_flags & IFF_NO_QUEUE)
+ ops = &noqueue_qdisc_ops;
+
+ qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
+ if (!qdisc) {
+ netdev_info(dev, "activation failed\n");
+ return;
+ }
+ if (!netif_is_multiqueue(dev))
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ dev_queue->qdisc_sleeping = qdisc;
+}
+
+static void attach_default_qdiscs(struct net_device *dev)
+{
+ struct netdev_queue *txq;
+ struct Qdisc *qdisc;
+
+ txq = netdev_get_tx_queue(dev, 0);
+
+ if (!netif_is_multiqueue(dev) ||
+ dev->priv_flags & IFF_NO_QUEUE) {
+ netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
+ dev->qdisc = txq->qdisc_sleeping;
+ qdisc_refcount_inc(dev->qdisc);
+ } else {
+ qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
+ if (qdisc) {
+ dev->qdisc = qdisc;
+ qdisc->ops->attach(qdisc);
+ }
+ }
+#ifdef CONFIG_NET_SCHED
+ if (dev->qdisc != &noop_qdisc)
+ qdisc_hash_add(dev->qdisc, false);
+#endif
+}
+
+static void transition_one_qdisc(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_need_watchdog)
+{
+ struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
+ int *need_watchdog_p = _need_watchdog;
+
+ if (!(new_qdisc->flags & TCQ_F_BUILTIN))
+ clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
+
+ rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
+ if (need_watchdog_p) {
+ dev_queue->trans_start = 0;
+ *need_watchdog_p = 1;
+ }
+}
+
+void dev_activate(struct net_device *dev)
+{
+ int need_watchdog;
+
+ /* No queueing discipline is attached to device;
+ * create default one for devices, which need queueing
+ * and noqueue_qdisc for virtual interfaces
+ */
+
+ if (dev->qdisc == &noop_qdisc)
+ attach_default_qdiscs(dev);
+
+ if (!netif_carrier_ok(dev))
+ /* Delay activation until next carrier-on event */
+ return;
+
+ need_watchdog = 0;
+ netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
+ if (dev_ingress_queue(dev))
+ transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
+
+ if (need_watchdog) {
+ netif_trans_update(dev);
+ dev_watchdog_up(dev);
+ }
+}
+EXPORT_SYMBOL(dev_activate);
+
+static void dev_deactivate_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_qdisc_default)
+{
+ struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc);
+ struct Qdisc *qdisc_default = _qdisc_default;
+
+ if (qdisc) {
+ if (!(qdisc->flags & TCQ_F_BUILTIN))
+ set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
+
+ rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+ }
+}
+
+static void dev_reset_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_unused)
+{
+ struct Qdisc *qdisc;
+ bool nolock;
+
+ qdisc = dev_queue->qdisc_sleeping;
+ if (!qdisc)
+ return;
+
+ nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+ if (nolock)
+ spin_lock_bh(&qdisc->seqlock);
+ spin_lock_bh(qdisc_lock(qdisc));
+
+ qdisc_reset(qdisc);
+
+ spin_unlock_bh(qdisc_lock(qdisc));
+ if (nolock)
+ spin_unlock_bh(&qdisc->seqlock);
+}
+
+static bool some_qdisc_is_busy(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *dev_queue;
+ spinlock_t *root_lock;
+ struct Qdisc *q;
+ int val;
+
+ dev_queue = netdev_get_tx_queue(dev, i);
+ q = dev_queue->qdisc_sleeping;
+
+ root_lock = qdisc_lock(q);
+ spin_lock_bh(root_lock);
+
+ val = (qdisc_is_running(q) ||
+ test_bit(__QDISC_STATE_SCHED, &q->state));
+
+ spin_unlock_bh(root_lock);
+
+ if (val)
+ return true;
+ }
+ return false;
+}
+
+static void dev_qdisc_reset(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *none)
+{
+ struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+ if (qdisc)
+ qdisc_reset(qdisc);
+}
+
+/**
+ * dev_deactivate_many - deactivate transmissions on several devices
+ * @head: list of devices to deactivate
+ *
+ * This function returns only when all outstanding transmissions
+ * have completed, unless all devices are in dismantle phase.
+ */
+void dev_deactivate_many(struct list_head *head)
+{
+ struct net_device *dev;
+
+ list_for_each_entry(dev, head, close_list) {
+ netdev_for_each_tx_queue(dev, dev_deactivate_queue,
+ &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ dev_deactivate_queue(dev, dev_ingress_queue(dev),
+ &noop_qdisc);
+
+ dev_watchdog_down(dev);
+ }
+
+ /* Wait for outstanding qdisc-less dev_queue_xmit calls or
+ * outstanding qdisc enqueuing calls.
+ * This is avoided if all devices are in dismantle phase :
+ * Caller will call synchronize_net() for us
+ */
+ synchronize_net();
+
+ list_for_each_entry(dev, head, close_list) {
+ netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);
+
+ if (dev_ingress_queue(dev))
+ dev_reset_queue(dev, dev_ingress_queue(dev), NULL);
+ }
+
+ /* Wait for outstanding qdisc_run calls. */
+ list_for_each_entry(dev, head, close_list) {
+ while (some_qdisc_is_busy(dev))
+ yield();
+ /* The new qdisc is assigned at this point so we can safely
+ * unwind stale skb lists and qdisc statistics
+ */
+ netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
+ if (dev_ingress_queue(dev))
+ dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
+ }
+}
+
+void dev_deactivate(struct net_device *dev)
+{
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ dev_deactivate_many(&single);
+ list_del(&single);
+}
+EXPORT_SYMBOL(dev_deactivate);
+
+static int qdisc_change_tx_queue_len(struct net_device *dev,
+ struct netdev_queue *dev_queue)
+{
+ struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+ const struct Qdisc_ops *ops = qdisc->ops;
+
+ if (ops->change_tx_queue_len)
+ return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
+ return 0;
+}
+
+void dev_qdisc_change_real_num_tx(struct net_device *dev,
+ unsigned int new_real_tx)
+{
+ struct Qdisc *qdisc = dev->qdisc;
+
+ if (qdisc->ops->change_real_num_tx)
+ qdisc->ops->change_real_num_tx(qdisc, new_real_tx);
+}
+
+int dev_qdisc_change_tx_queue_len(struct net_device *dev)
+{
+ bool up = dev->flags & IFF_UP;
+ unsigned int i;
+ int ret = 0;
+
+ if (up)
+ dev_deactivate(dev);
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);
+
+ /* TODO: revert changes on a partial failure */
+ if (ret)
+ break;
+ }
+
+ if (up)
+ dev_activate(dev);
+ return ret;
+}
+
+static void dev_init_scheduler_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_qdisc)
+{
+ struct Qdisc *qdisc = _qdisc;
+
+ rcu_assign_pointer(dev_queue->qdisc, qdisc);
+ dev_queue->qdisc_sleeping = qdisc;
+}
+
+void dev_init_scheduler(struct net_device *dev)
+{
+ dev->qdisc = &noop_qdisc;
+ netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+
+ timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
+}
+
+static void shutdown_scheduler_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_qdisc_default)
+{
+ struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+ struct Qdisc *qdisc_default = _qdisc_default;
+
+ if (qdisc) {
+ rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+ dev_queue->qdisc_sleeping = qdisc_default;
+
+ qdisc_put(qdisc);
+ }
+}
+
+void dev_shutdown(struct net_device *dev)
+{
+ netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+ qdisc_put(dev->qdisc);
+ dev->qdisc = &noop_qdisc;
+
+ WARN_ON(timer_pending(&dev->watchdog_timer));
+}
+
+void psched_ratecfg_precompute(struct psched_ratecfg *r,
+ const struct tc_ratespec *conf,
+ u64 rate64)
+{
+ memset(r, 0, sizeof(*r));
+ r->overhead = conf->overhead;
+ r->mpu = conf->mpu;
+ r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
+ r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
+ r->mult = 1;
+ /*
+ * The deal here is to replace a divide by a reciprocal one
+ * in fast path (a reciprocal divide is a multiply and a shift)
+ *
+ * Normal formula would be :
+ * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ * time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
+ */
+ if (r->rate_bytes_ps > 0) {
+ u64 factor = NSEC_PER_SEC;
+
+ for (;;) {
+ r->mult = div64_u64(factor, r->rate_bytes_ps);
+ if (r->mult & (1U << 31) || factor & (1ULL << 63))
+ break;
+ factor <<= 1;
+ r->shift++;
+ }
+ }
+}
+EXPORT_SYMBOL(psched_ratecfg_precompute);
+
+static void mini_qdisc_rcu_func(struct rcu_head *head)
+{
+}
+
+void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
+ struct tcf_proto *tp_head)
+{
+ struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
+ struct mini_Qdisc *miniq;
+
+ if (!tp_head) {
+ RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
+ /* Wait for flying RCU callback before it is freed. */
+ rcu_barrier_bh();
+ return;
+ }
+
+ miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
+ &miniqp->miniq1 : &miniqp->miniq2;
+
+ /* We need to make sure that readers won't see the miniq
+ * we are about to modify. So wait until previous call_rcu_bh callback
+ * is done.
+ */
+ rcu_barrier_bh();
+ miniq->filter_list = tp_head;
+ rcu_assign_pointer(*miniqp->p_miniq, miniq);
+
+ if (miniq_old)
+ /* This is counterpart of the rcu barriers above. We need to
+ * block potential new user of miniq_old until all readers
+ * are not seeing it.
+ */
+ call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
+}
+EXPORT_SYMBOL(mini_qdisc_pair_swap);
+
+void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
+ struct mini_Qdisc __rcu **p_miniq)
+{
+ miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
+ miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
+ miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
+ miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
+ miniqp->p_miniq = p_miniq;
+}
+EXPORT_SYMBOL(mini_qdisc_pair_init);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
new file mode 100644
index 000000000..db7a91a29
--- /dev/null
+++ b/net/sched/sch_gred.c
@@ -0,0 +1,620 @@
+/*
+ * net/sched/sch_gred.c Generic Random Early Detection queue.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
+ *
+ * 991129: - Bug fix with grio mode
+ * - a better sing. AvgQ mode with Grio(WRED)
+ * - A finer grained VQ dequeue based on sugestion
+ * from Ren Liu
+ * - More error checks
+ *
+ * For all the glorious comments look at include/net/red.h
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/red.h>
+
+#define GRED_DEF_PRIO (MAX_DPs / 2)
+#define GRED_VQ_MASK (MAX_DPs - 1)
+
+struct gred_sched_data;
+struct gred_sched;
+
+struct gred_sched_data {
+ u32 limit; /* HARD maximal queue length */
+ u32 DP; /* the drop parameters */
+ u32 bytesin; /* bytes seen on virtualQ so far*/
+ u32 packetsin; /* packets seen on virtualQ so far*/
+ u32 backlog; /* bytes on the virtualQ */
+ u8 prio; /* the prio of this vq */
+
+ struct red_parms parms;
+ struct red_vars vars;
+ struct red_stats stats;
+};
+
+enum {
+ GRED_WRED_MODE = 1,
+ GRED_RIO_MODE,
+};
+
+struct gred_sched {
+ struct gred_sched_data *tab[MAX_DPs];
+ unsigned long flags;
+ u32 red_flags;
+ u32 DPs;
+ u32 def;
+ struct red_vars wred_set;
+};
+
+static inline int gred_wred_mode(struct gred_sched *table)
+{
+ return test_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_enable_wred_mode(struct gred_sched *table)
+{
+ __set_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_disable_wred_mode(struct gred_sched *table)
+{
+ __clear_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline int gred_rio_mode(struct gred_sched *table)
+{
+ return test_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_enable_rio_mode(struct gred_sched *table)
+{
+ __set_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_disable_rio_mode(struct gred_sched *table)
+{
+ __clear_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline int gred_wred_mode_check(struct Qdisc *sch)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ int i;
+
+ /* Really ugly O(n^2) but shouldn't be necessary too frequent. */
+ for (i = 0; i < table->DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ int n;
+
+ if (q == NULL)
+ continue;
+
+ for (n = i + 1; n < table->DPs; n++)
+ if (table->tab[n] && table->tab[n]->prio == q->prio)
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline unsigned int gred_backlog(struct gred_sched *table,
+ struct gred_sched_data *q,
+ struct Qdisc *sch)
+{
+ if (gred_wred_mode(table))
+ return sch->qstats.backlog;
+ else
+ return q->backlog;
+}
+
+static inline u16 tc_index_to_dp(struct sk_buff *skb)
+{
+ return skb->tc_index & GRED_VQ_MASK;
+}
+
+static inline void gred_load_wred_set(const struct gred_sched *table,
+ struct gred_sched_data *q)
+{
+ q->vars.qavg = table->wred_set.qavg;
+ q->vars.qidlestart = table->wred_set.qidlestart;
+}
+
+static inline void gred_store_wred_set(struct gred_sched *table,
+ struct gred_sched_data *q)
+{
+ table->wred_set.qavg = q->vars.qavg;
+ table->wred_set.qidlestart = q->vars.qidlestart;
+}
+
+static inline int gred_use_ecn(struct gred_sched *t)
+{
+ return t->red_flags & TC_RED_ECN;
+}
+
+static inline int gred_use_harddrop(struct gred_sched *t)
+{
+ return t->red_flags & TC_RED_HARDDROP;
+}
+
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct gred_sched_data *q = NULL;
+ struct gred_sched *t = qdisc_priv(sch);
+ unsigned long qavg = 0;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ dp = t->def;
+
+ q = t->tab[dp];
+ if (!q) {
+ /* Pass through packets not assigned to a DP
+ * if no default DP has been configured. This
+ * allows for DP flows to be left untouched.
+ */
+ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <=
+ sch->limit))
+ return qdisc_enqueue_tail(skb, sch);
+ else
+ goto drop;
+ }
+
+ /* fix tc_index? --could be controversial but needed for
+ requeueing */
+ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
+ }
+
+ /* sum up all the qaves of prios < ours to get the new qave */
+ if (!gred_wred_mode(t) && gred_rio_mode(t)) {
+ int i;
+
+ for (i = 0; i < t->DPs; i++) {
+ if (t->tab[i] && t->tab[i]->prio < q->prio &&
+ !red_is_idling(&t->tab[i]->vars))
+ qavg += t->tab[i]->vars.qavg;
+ }
+
+ }
+
+ q->packetsin++;
+ q->bytesin += qdisc_pkt_len(skb);
+
+ if (gred_wred_mode(t))
+ gred_load_wred_set(t, q);
+
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ gred_backlog(t, q, sch));
+
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ if (gred_wred_mode(t))
+ gred_store_wred_set(t, q);
+
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ q->stats.forced_mark++;
+ break;
+ }
+
+ if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) {
+ q->backlog += qdisc_pkt_len(skb);
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+ q->stats.pdrop++;
+drop:
+ return qdisc_drop(skb, sch, to_free);
+
+congestion_drop:
+ qdisc_drop(skb, sch, to_free);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *gred_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct gred_sched *t = qdisc_priv(sch);
+
+ skb = qdisc_dequeue_head(sch);
+
+ if (skb) {
+ struct gred_sched_data *q;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n",
+ tc_index_to_dp(skb));
+ } else {
+ q->backlog -= qdisc_pkt_len(skb);
+
+ if (gred_wred_mode(t)) {
+ if (!sch->qstats.backlog)
+ red_start_of_idle_period(&t->wred_set);
+ } else {
+ if (!q->backlog)
+ red_start_of_idle_period(&q->vars);
+ }
+ }
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static void gred_reset(struct Qdisc *sch)
+{
+ int i;
+ struct gred_sched *t = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+
+ for (i = 0; i < t->DPs; i++) {
+ struct gred_sched_data *q = t->tab[i];
+
+ if (!q)
+ continue;
+
+ red_restart(&q->vars);
+ q->backlog = 0;
+ }
+}
+
+static inline void gred_destroy_vq(struct gred_sched_data *q)
+{
+ kfree(q);
+}
+
+static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct tc_gred_sopt *sopt;
+ int i;
+
+ if (!dps)
+ return -EINVAL;
+
+ sopt = nla_data(dps);
+
+ if (sopt->DPs > MAX_DPs || sopt->DPs == 0 ||
+ sopt->def_DP >= sopt->DPs)
+ return -EINVAL;
+
+ sch_tree_lock(sch);
+ table->DPs = sopt->DPs;
+ table->def = sopt->def_DP;
+ table->red_flags = sopt->flags;
+
+ /*
+ * Every entry point to GRED is synchronized with the above code
+ * and the DP is checked against DPs, i.e. shadowed VQs can no
+ * longer be found so we can unlock right here.
+ */
+ sch_tree_unlock(sch);
+
+ if (sopt->grio) {
+ gred_enable_rio_mode(table);
+ gred_disable_wred_mode(table);
+ if (gred_wred_mode_check(sch))
+ gred_enable_wred_mode(table);
+ } else {
+ gred_disable_rio_mode(table);
+ gred_disable_wred_mode(table);
+ }
+
+ for (i = table->DPs; i < MAX_DPs; i++) {
+ if (table->tab[i]) {
+ pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
+ i);
+ gred_destroy_vq(table->tab[i]);
+ table->tab[i] = NULL;
+ }
+ }
+
+ return 0;
+}
+
+static inline int gred_change_vq(struct Qdisc *sch, int dp,
+ struct tc_gred_qopt *ctl, int prio,
+ u8 *stab, u32 max_P,
+ struct gred_sched_data **prealloc)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct gred_sched_data *q = table->tab[dp];
+
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
+ return -EINVAL;
+
+ if (!q) {
+ table->tab[dp] = q = *prealloc;
+ *prealloc = NULL;
+ if (!q)
+ return -ENOMEM;
+ }
+
+ q->DP = dp;
+ q->prio = prio;
+ if (ctl->limit > sch->limit)
+ q->limit = sch->limit;
+ else
+ q->limit = ctl->limit;
+
+ if (q->backlog == 0)
+ red_end_of_idle_period(&q->vars);
+
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
+ ctl->Scell_log, stab, max_P);
+ red_set_vars(&q->vars);
+ return 0;
+}
+
+static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
+ [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
+ [TCA_GRED_STAB] = { .len = 256 },
+ [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
+ [TCA_GRED_MAX_P] = { .type = NLA_U32 },
+ [TCA_GRED_LIMIT] = { .type = NLA_U32 },
+};
+
+static int gred_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct tc_gred_qopt *ctl;
+ struct nlattr *tb[TCA_GRED_MAX + 1];
+ int err, prio = GRED_DEF_PRIO;
+ u8 *stab;
+ u32 max_P;
+ struct gred_sched_data *prealloc;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) {
+ if (tb[TCA_GRED_LIMIT] != NULL)
+ sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+ }
+
+ if (tb[TCA_GRED_PARMS] == NULL ||
+ tb[TCA_GRED_STAB] == NULL ||
+ tb[TCA_GRED_LIMIT] != NULL)
+ return -EINVAL;
+
+ max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
+
+ err = -EINVAL;
+ ctl = nla_data(tb[TCA_GRED_PARMS]);
+ stab = nla_data(tb[TCA_GRED_STAB]);
+
+ if (ctl->DP >= table->DPs)
+ goto errout;
+
+ if (gred_rio_mode(table)) {
+ if (ctl->prio == 0) {
+ int def_prio = GRED_DEF_PRIO;
+
+ if (table->tab[table->def])
+ def_prio = table->tab[table->def]->prio;
+
+ printk(KERN_DEBUG "GRED: DP %u does not have a prio "
+ "setting default to %d\n", ctl->DP, def_prio);
+
+ prio = def_prio;
+ } else
+ prio = ctl->prio;
+ }
+
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ sch_tree_lock(sch);
+
+ err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
+ if (err < 0)
+ goto errout_locked;
+
+ if (gred_rio_mode(table)) {
+ gred_disable_wred_mode(table);
+ if (gred_wred_mode_check(sch))
+ gred_enable_wred_mode(table);
+ }
+
+ err = 0;
+
+errout_locked:
+ sch_tree_unlock(sch);
+ kfree(prealloc);
+errout:
+ return err;
+}
+
+static int gred_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_GRED_MAX + 1];
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
+ return -EINVAL;
+
+ if (tb[TCA_GRED_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
+ else
+ sch->limit = qdisc_dev(sch)->tx_queue_len
+ * psched_mtu(qdisc_dev(sch));
+
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+}
+
+static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct nlattr *parms, *opts = NULL;
+ int i;
+ u32 max_p[MAX_DPs];
+ struct tc_gred_sopt sopt = {
+ .DPs = table->DPs,
+ .def_DP = table->def,
+ .grio = gred_rio_mode(table),
+ .flags = table->red_flags,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
+ goto nla_put_failure;
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+
+ max_p[i] = q ? q->parms.max_P : 0;
+ }
+ if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit))
+ goto nla_put_failure;
+
+ parms = nla_nest_start(skb, TCA_GRED_PARMS);
+ if (parms == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ struct tc_gred_qopt opt;
+ unsigned long qavg;
+
+ memset(&opt, 0, sizeof(opt));
+
+ if (!q) {
+ /* hack -- fix at some point with proper message
+ This is how we indicate to tc that there is no VQ
+ at this DP */
+
+ opt.DP = MAX_DPs + i;
+ goto append_opt;
+ }
+
+ opt.limit = q->limit;
+ opt.DP = q->DP;
+ opt.backlog = gred_backlog(table, q, sch);
+ opt.prio = q->prio;
+ opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
+ opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
+ opt.Wlog = q->parms.Wlog;
+ opt.Plog = q->parms.Plog;
+ opt.Scell_log = q->parms.Scell_log;
+ opt.other = q->stats.other;
+ opt.early = q->stats.prob_drop;
+ opt.forced = q->stats.forced_drop;
+ opt.pdrop = q->stats.pdrop;
+ opt.packets = q->packetsin;
+ opt.bytesin = q->bytesin;
+
+ if (gred_wred_mode(table))
+ gred_load_wred_set(table, q);
+
+ qavg = red_calc_qavg(&q->parms, &q->vars,
+ q->vars.qavg >> q->parms.Wlog);
+ opt.qave = qavg >> q->parms.Wlog;
+
+append_opt:
+ if (nla_append(skb, sizeof(opt), &opt) < 0)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, parms);
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static void gred_destroy(struct Qdisc *sch)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ int i;
+
+ for (i = 0; i < table->DPs; i++) {
+ if (table->tab[i])
+ gred_destroy_vq(table->tab[i]);
+ }
+}
+
+static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
+ .id = "gred",
+ .priv_size = sizeof(struct gred_sched),
+ .enqueue = gred_enqueue,
+ .dequeue = gred_dequeue,
+ .peek = qdisc_peek_head,
+ .init = gred_init,
+ .reset = gred_reset,
+ .destroy = gred_destroy,
+ .change = gred_change,
+ .dump = gred_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init gred_module_init(void)
+{
+ return register_qdisc(&gred_qdisc_ops);
+}
+
+static void __exit gred_module_exit(void)
+{
+ unregister_qdisc(&gred_qdisc_ops);
+}
+
+module_init(gred_module_init)
+module_exit(gred_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
new file mode 100644
index 000000000..b18ec1f6d
--- /dev/null
+++ b/net/sched/sch_hfsc.c
@@ -0,0 +1,1697 @@
+/*
+ * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * 2003-10-17 - Ported from altq
+ */
+/*
+ * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation is hereby granted (including for commercial or
+ * for-profit use), provided that both the copyright notice and this
+ * permission notice appear in all copies of the software, derivative
+ * works, or modified versions, and any portions thereof.
+ *
+ * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
+ * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS
+ * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Carnegie Mellon encourages (but does not require) users of this
+ * software to return any improvements or extensions that they make,
+ * and to grant Carnegie Mellon the rights to redistribute these
+ * changes without encumbrance.
+ */
+/*
+ * H-FSC is described in Proceedings of SIGCOMM'97,
+ * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
+ * Real-Time and Priority Service"
+ * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
+ *
+ * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
+ * when a class has an upperlimit, the fit-time is computed from the
+ * upperlimit service curve. the link-sharing scheduler does not schedule
+ * a class whose fit-time exceeds the current time.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/init.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <asm/div64.h>
+
+/*
+ * kernel internal service curve representation:
+ * coordinates are given by 64 bit unsigned integers.
+ * x-axis: unit is clock count.
+ * y-axis: unit is byte.
+ *
+ * The service curve parameters are converted to the internal
+ * representation. The slope values are scaled to avoid overflow.
+ * the inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ */
+
+struct internal_sc {
+ u64 sm1; /* scaled slope of the 1st segment */
+ u64 ism1; /* scaled inverse-slope of the 1st segment */
+ u64 dx; /* the x-projection of the 1st segment */
+ u64 dy; /* the y-projection of the 1st segment */
+ u64 sm2; /* scaled slope of the 2nd segment */
+ u64 ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+ u64 x; /* current starting position on x-axis */
+ u64 y; /* current starting position on y-axis */
+ u64 sm1; /* scaled slope of the 1st segment */
+ u64 ism1; /* scaled inverse-slope of the 1st segment */
+ u64 dx; /* the x-projection of the 1st segment */
+ u64 dy; /* the y-projection of the 1st segment */
+ u64 sm2; /* scaled slope of the 2nd segment */
+ u64 ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+enum hfsc_class_flags {
+ HFSC_RSC = 0x1,
+ HFSC_FSC = 0x2,
+ HFSC_USC = 0x4
+};
+
+struct hfsc_class {
+ struct Qdisc_class_common cl_common;
+
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct net_rate_estimator __rcu *rate_est;
+ struct tcf_proto __rcu *filter_list; /* filter list */
+ struct tcf_block *block;
+ unsigned int filter_cnt; /* filter count */
+ unsigned int level; /* class level in hierarchy */
+
+ struct hfsc_sched *sched; /* scheduler data */
+ struct hfsc_class *cl_parent; /* parent class */
+ struct list_head siblings; /* sibling classes */
+ struct list_head children; /* child classes */
+ struct Qdisc *qdisc; /* leaf qdisc */
+
+ struct rb_node el_node; /* qdisc's eligible tree member */
+ struct rb_root vt_tree; /* active children sorted by cl_vt */
+ struct rb_node vt_node; /* parent's vt_tree member */
+ struct rb_root cf_tree; /* active children sorted by cl_f */
+ struct rb_node cf_node; /* parent's cf_heap member */
+
+ u64 cl_total; /* total work in bytes */
+ u64 cl_cumul; /* cumulative work in bytes done by
+ real-time criteria */
+
+ u64 cl_d; /* deadline*/
+ u64 cl_e; /* eligible time */
+ u64 cl_vt; /* virtual time */
+ u64 cl_f; /* time when this class will fit for
+ link-sharing, max(myf, cfmin) */
+ u64 cl_myf; /* my fit-time (calculated from this
+ class's own upperlimit curve) */
+ u64 cl_cfmin; /* earliest children's fit-time (used
+ with cl_myf to obtain cl_f) */
+ u64 cl_cvtmin; /* minimal virtual time among the
+ children fit for link-sharing
+ (monotonic within a period) */
+ u64 cl_vtadj; /* intra-period cumulative vt
+ adjustment */
+ u64 cl_cvtoff; /* largest virtual time seen among
+ the children */
+
+ struct internal_sc cl_rsc; /* internal real-time service curve */
+ struct internal_sc cl_fsc; /* internal fair service curve */
+ struct internal_sc cl_usc; /* internal upperlimit service curve */
+ struct runtime_sc cl_deadline; /* deadline curve */
+ struct runtime_sc cl_eligible; /* eligible curve */
+ struct runtime_sc cl_virtual; /* virtual curve */
+ struct runtime_sc cl_ulimit; /* upperlimit curve */
+
+ u8 cl_flags; /* which curves are valid */
+ u32 cl_vtperiod; /* vt period sequence number */
+ u32 cl_parentperiod;/* parent's vt period sequence number*/
+ u32 cl_nactive; /* number of active children */
+};
+
+struct hfsc_sched {
+ u16 defcls; /* default class id */
+ struct hfsc_class root; /* root class */
+ struct Qdisc_class_hash clhash; /* class hash */
+ struct rb_root eligible; /* eligible tree */
+ struct qdisc_watchdog watchdog; /* watchdog timer */
+};
+
+#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
+
+
+/*
+ * eligible tree holds backlogged classes being sorted by their eligible times.
+ * there is one eligible tree per hfsc instance.
+ */
+
+static void
+eltree_insert(struct hfsc_class *cl)
+{
+ struct rb_node **p = &cl->sched->eligible.rb_node;
+ struct rb_node *parent = NULL;
+ struct hfsc_class *cl1;
+
+ while (*p != NULL) {
+ parent = *p;
+ cl1 = rb_entry(parent, struct hfsc_class, el_node);
+ if (cl->cl_e >= cl1->cl_e)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->el_node, parent, p);
+ rb_insert_color(&cl->el_node, &cl->sched->eligible);
+}
+
+static inline void
+eltree_remove(struct hfsc_class *cl)
+{
+ rb_erase(&cl->el_node, &cl->sched->eligible);
+}
+
+static inline void
+eltree_update(struct hfsc_class *cl)
+{
+ eltree_remove(cl);
+ eltree_insert(cl);
+}
+
+/* find the class with the minimum deadline among the eligible classes */
+static inline struct hfsc_class *
+eltree_get_mindl(struct hfsc_sched *q, u64 cur_time)
+{
+ struct hfsc_class *p, *cl = NULL;
+ struct rb_node *n;
+
+ for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) {
+ p = rb_entry(n, struct hfsc_class, el_node);
+ if (p->cl_e > cur_time)
+ break;
+ if (cl == NULL || p->cl_d < cl->cl_d)
+ cl = p;
+ }
+ return cl;
+}
+
+/* find the class with minimum eligible time among the eligible classes */
+static inline struct hfsc_class *
+eltree_get_minel(struct hfsc_sched *q)
+{
+ struct rb_node *n;
+
+ n = rb_first(&q->eligible);
+ if (n == NULL)
+ return NULL;
+ return rb_entry(n, struct hfsc_class, el_node);
+}
+
+/*
+ * vttree holds holds backlogged child classes being sorted by their virtual
+ * time. each intermediate class has one vttree.
+ */
+static void
+vttree_insert(struct hfsc_class *cl)
+{
+ struct rb_node **p = &cl->cl_parent->vt_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct hfsc_class *cl1;
+
+ while (*p != NULL) {
+ parent = *p;
+ cl1 = rb_entry(parent, struct hfsc_class, vt_node);
+ if (cl->cl_vt >= cl1->cl_vt)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->vt_node, parent, p);
+ rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree);
+}
+
+static inline void
+vttree_remove(struct hfsc_class *cl)
+{
+ rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree);
+}
+
+static inline void
+vttree_update(struct hfsc_class *cl)
+{
+ vttree_remove(cl);
+ vttree_insert(cl);
+}
+
+static inline struct hfsc_class *
+vttree_firstfit(struct hfsc_class *cl, u64 cur_time)
+{
+ struct hfsc_class *p;
+ struct rb_node *n;
+
+ for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) {
+ p = rb_entry(n, struct hfsc_class, vt_node);
+ if (p->cl_f <= cur_time)
+ return p;
+ }
+ return NULL;
+}
+
+/*
+ * get the leaf class with the minimum vt in the hierarchy
+ */
+static struct hfsc_class *
+vttree_get_minvt(struct hfsc_class *cl, u64 cur_time)
+{
+ /* if root-class's cfmin is bigger than cur_time nothing to do */
+ if (cl->cl_cfmin > cur_time)
+ return NULL;
+
+ while (cl->level > 0) {
+ cl = vttree_firstfit(cl, cur_time);
+ if (cl == NULL)
+ return NULL;
+ /*
+ * update parent's cl_cvtmin.
+ */
+ if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
+ cl->cl_parent->cl_cvtmin = cl->cl_vt;
+ }
+ return cl;
+}
+
+static void
+cftree_insert(struct hfsc_class *cl)
+{
+ struct rb_node **p = &cl->cl_parent->cf_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct hfsc_class *cl1;
+
+ while (*p != NULL) {
+ parent = *p;
+ cl1 = rb_entry(parent, struct hfsc_class, cf_node);
+ if (cl->cl_f >= cl1->cl_f)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->cf_node, parent, p);
+ rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree);
+}
+
+static inline void
+cftree_remove(struct hfsc_class *cl)
+{
+ rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree);
+}
+
+static inline void
+cftree_update(struct hfsc_class *cl)
+{
+ cftree_remove(cl);
+ cftree_insert(cl);
+}
+
+/*
+ * service curve support functions
+ *
+ * external service curve parameters
+ * m: bps
+ * d: us
+ * internal service curve parameters
+ * sm: (bytes/psched_us) << SM_SHIFT
+ * ism: (psched_us/byte) << ISM_SHIFT
+ * dx: psched_us
+ *
+ * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us.
+ *
+ * sm and ism are scaled in order to keep effective digits.
+ * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
+ * digits in decimal using the following table.
+ *
+ * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps
+ * ------------+-------------------------------------------------------
+ * bytes/1.024us 12.8e-3 128e-3 1280e-3 12800e-3 128000e-3
+ *
+ * 1.024us/byte 78.125 7.8125 0.78125 0.078125 0.0078125
+ *
+ * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18.
+ */
+#define SM_SHIFT (30 - PSCHED_SHIFT)
+#define ISM_SHIFT (8 + PSCHED_SHIFT)
+
+#define SM_MASK ((1ULL << SM_SHIFT) - 1)
+#define ISM_MASK ((1ULL << ISM_SHIFT) - 1)
+
+static inline u64
+seg_x2y(u64 x, u64 sm)
+{
+ u64 y;
+
+ /*
+ * compute
+ * y = x * sm >> SM_SHIFT
+ * but divide it for the upper and lower bits to avoid overflow
+ */
+ y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
+ return y;
+}
+
+static inline u64
+seg_y2x(u64 y, u64 ism)
+{
+ u64 x;
+
+ if (y == 0)
+ x = 0;
+ else if (ism == HT_INFINITY)
+ x = HT_INFINITY;
+ else {
+ x = (y >> ISM_SHIFT) * ism
+ + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
+ }
+ return x;
+}
+
+/* Convert m (bps) into sm (bytes/psched us) */
+static u64
+m2sm(u32 m)
+{
+ u64 sm;
+
+ sm = ((u64)m << SM_SHIFT);
+ sm += PSCHED_TICKS_PER_SEC - 1;
+ do_div(sm, PSCHED_TICKS_PER_SEC);
+ return sm;
+}
+
+/* convert m (bps) into ism (psched us/byte) */
+static u64
+m2ism(u32 m)
+{
+ u64 ism;
+
+ if (m == 0)
+ ism = HT_INFINITY;
+ else {
+ ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT);
+ ism += m - 1;
+ do_div(ism, m);
+ }
+ return ism;
+}
+
+/* convert d (us) into dx (psched us) */
+static u64
+d2dx(u32 d)
+{
+ u64 dx;
+
+ dx = ((u64)d * PSCHED_TICKS_PER_SEC);
+ dx += USEC_PER_SEC - 1;
+ do_div(dx, USEC_PER_SEC);
+ return dx;
+}
+
+/* convert sm (bytes/psched us) into m (bps) */
+static u32
+sm2m(u64 sm)
+{
+ u64 m;
+
+ m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT;
+ return (u32)m;
+}
+
+/* convert dx (psched us) into d (us) */
+static u32
+dx2d(u64 dx)
+{
+ u64 d;
+
+ d = dx * USEC_PER_SEC;
+ do_div(d, PSCHED_TICKS_PER_SEC);
+ return (u32)d;
+}
+
+static void
+sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
+{
+ isc->sm1 = m2sm(sc->m1);
+ isc->ism1 = m2ism(sc->m1);
+ isc->dx = d2dx(sc->d);
+ isc->dy = seg_x2y(isc->dx, isc->sm1);
+ isc->sm2 = m2sm(sc->m2);
+ isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * initialize the runtime service curve with the given internal
+ * service curve starting at (x, y).
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->sm1 = isc->sm1;
+ rtsc->ism1 = isc->ism1;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ rtsc->sm2 = isc->sm2;
+ rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the y-projection of the runtime service curve by the
+ * given x-projection value
+ */
+static u64
+rtsc_y2x(struct runtime_sc *rtsc, u64 y)
+{
+ u64 x;
+
+ if (y < rtsc->y)
+ x = rtsc->x;
+ else if (y <= rtsc->y + rtsc->dy) {
+ /* x belongs to the 1st segment */
+ if (rtsc->dy == 0)
+ x = rtsc->x + rtsc->dx;
+ else
+ x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+ } else {
+ /* x belongs to the 2nd segment */
+ x = rtsc->x + rtsc->dx
+ + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+ }
+ return x;
+}
+
+static u64
+rtsc_x2y(struct runtime_sc *rtsc, u64 x)
+{
+ u64 y;
+
+ if (x <= rtsc->x)
+ y = rtsc->y;
+ else if (x <= rtsc->x + rtsc->dx)
+ /* y belongs to the 1st segment */
+ y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+ else
+ /* y belongs to the 2nd segment */
+ y = rtsc->y + rtsc->dy
+ + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+ return y;
+}
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+ u64 y1, y2, dx, dy;
+ u32 dsm;
+
+ if (isc->sm1 <= isc->sm2) {
+ /* service curve is convex */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 < y)
+ /* the current rtsc is smaller */
+ return;
+ rtsc->x = x;
+ rtsc->y = y;
+ return;
+ }
+
+ /*
+ * service curve is concave
+ * compute the two y values of the current rtsc
+ * y1: at x
+ * y2: at (x + dx)
+ */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 <= y) {
+ /* rtsc is below isc, no change to rtsc */
+ return;
+ }
+
+ y2 = rtsc_x2y(rtsc, x + isc->dx);
+ if (y2 >= y + isc->dy) {
+ /* rtsc is above isc, replace rtsc by isc */
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ return;
+ }
+
+ /*
+ * the two curves intersect
+ * compute the offsets (dx, dy) using the reverse
+ * function of seg_x2y()
+ * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+ */
+ dx = (y1 - y) << SM_SHIFT;
+ dsm = isc->sm1 - isc->sm2;
+ do_div(dx, dsm);
+ /*
+ * check if (x, y1) belongs to the 1st segment of rtsc.
+ * if so, add the offset.
+ */
+ if (rtsc->x + rtsc->dx > x)
+ dx += rtsc->x + rtsc->dx - x;
+ dy = seg_x2y(dx, isc->sm1);
+
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->dx = dx;
+ rtsc->dy = dy;
+}
+
+static void
+init_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+ u64 cur_time = psched_get_time();
+
+ /* update the deadline curve */
+ rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
+
+ /*
+ * update the eligible curve.
+ * for concave, it is equal to the deadline curve.
+ * for convex, it is a linear curve with slope m2.
+ */
+ cl->cl_eligible = cl->cl_deadline;
+ if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
+ cl->cl_eligible.dx = 0;
+ cl->cl_eligible.dy = 0;
+ }
+
+ /* compute e and d */
+ cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+ cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+ eltree_insert(cl);
+}
+
+static void
+update_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+ cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+ cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+ eltree_update(cl);
+}
+
+static inline void
+update_d(struct hfsc_class *cl, unsigned int next_len)
+{
+ cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+}
+
+static inline void
+update_cfmin(struct hfsc_class *cl)
+{
+ struct rb_node *n = rb_first(&cl->cf_tree);
+ struct hfsc_class *p;
+
+ if (n == NULL) {
+ cl->cl_cfmin = 0;
+ return;
+ }
+ p = rb_entry(n, struct hfsc_class, cf_node);
+ cl->cl_cfmin = p->cl_f;
+}
+
+static void
+init_vf(struct hfsc_class *cl, unsigned int len)
+{
+ struct hfsc_class *max_cl;
+ struct rb_node *n;
+ u64 vt, f, cur_time;
+ int go_active;
+
+ cur_time = 0;
+ go_active = 1;
+ for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+ if (go_active && cl->cl_nactive++ == 0)
+ go_active = 1;
+ else
+ go_active = 0;
+
+ if (go_active) {
+ n = rb_last(&cl->cl_parent->vt_tree);
+ if (n != NULL) {
+ max_cl = rb_entry(n, struct hfsc_class, vt_node);
+ /*
+ * set vt to the average of the min and max
+ * classes. if the parent's period didn't
+ * change, don't decrease vt of the class.
+ */
+ vt = max_cl->cl_vt;
+ if (cl->cl_parent->cl_cvtmin != 0)
+ vt = (cl->cl_parent->cl_cvtmin + vt)/2;
+
+ if (cl->cl_parent->cl_vtperiod !=
+ cl->cl_parentperiod || vt > cl->cl_vt)
+ cl->cl_vt = vt;
+ } else {
+ /*
+ * first child for a new parent backlog period.
+ * initialize cl_vt to the highest value seen
+ * among the siblings. this is analogous to
+ * what cur_time would provide in realtime case.
+ */
+ cl->cl_vt = cl->cl_parent->cl_cvtoff;
+ cl->cl_parent->cl_cvtmin = 0;
+ }
+
+ /* update the virtual curve */
+ rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+ cl->cl_vtadj = 0;
+
+ cl->cl_vtperiod++; /* increment vt period */
+ cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
+ if (cl->cl_parent->cl_nactive == 0)
+ cl->cl_parentperiod++;
+ cl->cl_f = 0;
+
+ vttree_insert(cl);
+ cftree_insert(cl);
+
+ if (cl->cl_flags & HFSC_USC) {
+ /* class has upper limit curve */
+ if (cur_time == 0)
+ cur_time = psched_get_time();
+
+ /* update the ulimit curve */
+ rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time,
+ cl->cl_total);
+ /* compute myf */
+ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
+ cl->cl_total);
+ }
+ }
+
+ f = max(cl->cl_myf, cl->cl_cfmin);
+ if (f != cl->cl_f) {
+ cl->cl_f = f;
+ cftree_update(cl);
+ }
+ update_cfmin(cl->cl_parent);
+ }
+}
+
+static void
+update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
+{
+ u64 f; /* , myf_bound, delta; */
+ int go_passive = 0;
+
+ if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)
+ go_passive = 1;
+
+ for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+ cl->cl_total += len;
+
+ if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
+ continue;
+
+ if (go_passive && --cl->cl_nactive == 0)
+ go_passive = 1;
+ else
+ go_passive = 0;
+
+ /* update vt */
+ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj;
+
+ /*
+ * if vt of the class is smaller than cvtmin,
+ * the class was skipped in the past due to non-fit.
+ * if so, we need to adjust vtadj.
+ */
+ if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+ cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+ cl->cl_vt = cl->cl_parent->cl_cvtmin;
+ }
+
+ if (go_passive) {
+ /* no more active child, going passive */
+
+ /* update cvtoff of the parent class */
+ if (cl->cl_vt > cl->cl_parent->cl_cvtoff)
+ cl->cl_parent->cl_cvtoff = cl->cl_vt;
+
+ /* remove this class from the vt tree */
+ vttree_remove(cl);
+
+ cftree_remove(cl);
+ update_cfmin(cl->cl_parent);
+
+ continue;
+ }
+
+ /* update the vt tree */
+ vttree_update(cl);
+
+ /* update f */
+ if (cl->cl_flags & HFSC_USC) {
+ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total);
+#if 0
+ cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
+ cl->cl_total);
+ /*
+ * This code causes classes to stay way under their
+ * limit when multiple classes are used at gigabit
+ * speed. needs investigation. -kaber
+ */
+ /*
+ * if myf lags behind by more than one clock tick
+ * from the current time, adjust myfadj to prevent
+ * a rate-limited class from going greedy.
+ * in a steady state under rate-limiting, myf
+ * fluctuates within one clock tick.
+ */
+ myf_bound = cur_time - PSCHED_JIFFIE2US(1);
+ if (cl->cl_myf < myf_bound) {
+ delta = cur_time - cl->cl_myf;
+ cl->cl_myfadj += delta;
+ cl->cl_myf += delta;
+ }
+#endif
+ }
+
+ f = max(cl->cl_myf, cl->cl_cfmin);
+ if (f != cl->cl_f) {
+ cl->cl_f = f;
+ cftree_update(cl);
+ update_cfmin(cl->cl_parent);
+ }
+ }
+}
+
+static unsigned int
+qdisc_peek_len(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ unsigned int len;
+
+ skb = sch->ops->peek(sch);
+ if (unlikely(skb == NULL)) {
+ qdisc_warn_nonwc("qdisc_peek_len", sch);
+ return 0;
+ }
+ len = qdisc_pkt_len(skb);
+
+ return len;
+}
+
+static void
+hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
+{
+ unsigned int len = cl->qdisc->q.qlen;
+ unsigned int backlog = cl->qdisc->qstats.backlog;
+
+ qdisc_reset(cl->qdisc);
+ qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
+}
+
+static void
+hfsc_adjust_levels(struct hfsc_class *cl)
+{
+ struct hfsc_class *p;
+ unsigned int level;
+
+ do {
+ level = 0;
+ list_for_each_entry(p, &cl->children, siblings) {
+ if (p->level >= level)
+ level = p->level + 1;
+ }
+ cl->level = level;
+ } while ((cl = cl->cl_parent) != NULL);
+}
+
+static inline struct hfsc_class *
+hfsc_find_class(u32 classid, struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct hfsc_class, cl_common);
+}
+
+static void
+hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc,
+ u64 cur_time)
+{
+ sc2isc(rsc, &cl->cl_rsc);
+ rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
+ cl->cl_eligible = cl->cl_deadline;
+ if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
+ cl->cl_eligible.dx = 0;
+ cl->cl_eligible.dy = 0;
+ }
+ cl->cl_flags |= HFSC_RSC;
+}
+
+static void
+hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
+{
+ sc2isc(fsc, &cl->cl_fsc);
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+ cl->cl_flags |= HFSC_FSC;
+}
+
+static void
+hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc,
+ u64 cur_time)
+{
+ sc2isc(usc, &cl->cl_usc);
+ rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total);
+ cl->cl_flags |= HFSC_USC;
+}
+
+static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = {
+ [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) },
+ [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) },
+ [TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) },
+};
+
+static int
+hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl = (struct hfsc_class *)*arg;
+ struct hfsc_class *parent = NULL;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_HFSC_MAX + 1];
+ struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL;
+ u64 cur_time;
+ int err;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_HFSC_RSC]) {
+ rsc = nla_data(tb[TCA_HFSC_RSC]);
+ if (rsc->m1 == 0 && rsc->m2 == 0)
+ rsc = NULL;
+ }
+
+ if (tb[TCA_HFSC_FSC]) {
+ fsc = nla_data(tb[TCA_HFSC_FSC]);
+ if (fsc->m1 == 0 && fsc->m2 == 0)
+ fsc = NULL;
+ }
+
+ if (tb[TCA_HFSC_USC]) {
+ usc = nla_data(tb[TCA_HFSC_USC]);
+ if (usc->m1 == 0 && usc->m2 == 0)
+ usc = NULL;
+ }
+
+ if (cl != NULL) {
+ int old_flags;
+
+ if (parentid) {
+ if (cl->cl_parent &&
+ cl->cl_parent->cl_common.classid != parentid)
+ return -EINVAL;
+ if (cl->cl_parent == NULL && parentid != TC_H_ROOT)
+ return -EINVAL;
+ }
+ cur_time = psched_get_time();
+
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+
+ sch_tree_lock(sch);
+ old_flags = cl->cl_flags;
+
+ if (rsc != NULL)
+ hfsc_change_rsc(cl, rsc, cur_time);
+ if (fsc != NULL)
+ hfsc_change_fsc(cl, fsc);
+ if (usc != NULL)
+ hfsc_change_usc(cl, usc, cur_time);
+
+ if (cl->qdisc->q.qlen != 0) {
+ int len = qdisc_peek_len(cl->qdisc);
+
+ if (cl->cl_flags & HFSC_RSC) {
+ if (old_flags & HFSC_RSC)
+ update_ed(cl, len);
+ else
+ init_ed(cl, len);
+ }
+
+ if (cl->cl_flags & HFSC_FSC) {
+ if (old_flags & HFSC_FSC)
+ update_vf(cl, 0, cur_time);
+ else
+ init_vf(cl, len);
+ }
+ }
+ sch_tree_unlock(sch);
+
+ return 0;
+ }
+
+ if (parentid == TC_H_ROOT)
+ return -EEXIST;
+
+ parent = &q->root;
+ if (parentid) {
+ parent = hfsc_find_class(parentid, sch);
+ if (parent == NULL)
+ return -ENOENT;
+ }
+
+ if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
+ return -EINVAL;
+ if (hfsc_find_class(classid, sch))
+ return -EEXIST;
+
+ if (rsc == NULL && fsc == NULL)
+ return -EINVAL;
+
+ cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL);
+ if (cl == NULL)
+ return -ENOBUFS;
+
+ err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
+ if (err) {
+ kfree(cl);
+ return err;
+ }
+
+ if (tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ tcf_block_put(cl->block);
+ kfree(cl);
+ return err;
+ }
+ }
+
+ if (rsc != NULL)
+ hfsc_change_rsc(cl, rsc, 0);
+ if (fsc != NULL)
+ hfsc_change_fsc(cl, fsc);
+ if (usc != NULL)
+ hfsc_change_usc(cl, usc, 0);
+
+ cl->cl_common.classid = classid;
+ cl->sched = q;
+ cl->cl_parent = parent;
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ classid, NULL);
+ if (cl->qdisc == NULL)
+ cl->qdisc = &noop_qdisc;
+ else
+ qdisc_hash_add(cl->qdisc, true);
+ INIT_LIST_HEAD(&cl->children);
+ cl->vt_tree = RB_ROOT;
+ cl->cf_tree = RB_ROOT;
+
+ sch_tree_lock(sch);
+ qdisc_class_hash_insert(&q->clhash, &cl->cl_common);
+ list_add_tail(&cl->siblings, &parent->children);
+ if (parent->level == 0)
+ hfsc_purge_queue(sch, parent);
+ hfsc_adjust_levels(parent);
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+}
+
+static void
+hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+
+ tcf_block_put(cl->block);
+ qdisc_put(cl->qdisc);
+ gen_kill_estimator(&cl->rate_est);
+ if (cl != &q->root)
+ kfree(cl);
+}
+
+static int
+hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ list_del(&cl->siblings);
+ hfsc_adjust_levels(cl->cl_parent);
+
+ hfsc_purge_queue(sch, cl);
+ qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
+
+ sch_tree_unlock(sch);
+
+ hfsc_destroy_class(sch, cl);
+ return 0;
+}
+
+static struct hfsc_class *
+hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *head, *cl;
+ struct tcf_result res;
+ struct tcf_proto *tcf;
+ int result;
+
+ if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
+ (cl = hfsc_find_class(skb->priority, sch)) != NULL)
+ if (cl->level == 0)
+ return cl;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ head = &q->root;
+ tcf = rcu_dereference_bh(q->root.filter_list);
+ while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (struct hfsc_class *)res.class;
+ if (!cl) {
+ cl = hfsc_find_class(res.classid, sch);
+ if (!cl)
+ break; /* filter selected invalid classid */
+ if (cl->level >= head->level)
+ break; /* filter may only point downwards */
+ }
+
+ if (cl->level == 0)
+ return cl; /* hit leaf class */
+
+ /* apply inner filter chain */
+ tcf = rcu_dereference_bh(cl->filter_list);
+ head = cl;
+ }
+
+ /* classification failed, try default class */
+ cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+ if (cl == NULL || cl->level > 0)
+ return NULL;
+
+ return cl;
+}
+
+static int
+hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl->level > 0)
+ return -EINVAL;
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->cl_common.classid, NULL);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ return 0;
+}
+
+static struct Qdisc *
+hfsc_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl->level == 0)
+ return cl->qdisc;
+
+ return NULL;
+}
+
+static void
+hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
+ * needs to be called explicitly to remove a class from vttree.
+ */
+ update_vf(cl, 0, 0);
+ if (cl->cl_flags & HFSC_RSC)
+ eltree_remove(cl);
+}
+
+static unsigned long
+hfsc_search_class(struct Qdisc *sch, u32 classid)
+{
+ return (unsigned long)hfsc_find_class(classid, sch);
+}
+
+static unsigned long
+hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+ struct hfsc_class *p = (struct hfsc_class *)parent;
+ struct hfsc_class *cl = hfsc_find_class(classid, sch);
+
+ if (cl != NULL) {
+ if (p != NULL && p->level <= cl->level)
+ return 0;
+ cl->filter_cnt++;
+ }
+
+ return (unsigned long)cl;
+}
+
+static void
+hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ cl->filter_cnt--;
+}
+
+static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg,
+ struct netlink_ext_ack *extack)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl == NULL)
+ cl = &q->root;
+
+ return cl->block;
+}
+
+static int
+hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
+{
+ struct tc_service_curve tsc;
+
+ tsc.m1 = sm2m(sc->sm1);
+ tsc.d = dx2d(sc->dx);
+ tsc.m2 = sm2m(sc->sm2);
+ if (nla_put(skb, attr, sizeof(tsc), &tsc))
+ goto nla_put_failure;
+
+ return skb->len;
+
+ nla_put_failure:
+ return -1;
+}
+
+static int
+hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
+{
+ if ((cl->cl_flags & HFSC_RSC) &&
+ (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0))
+ goto nla_put_failure;
+
+ if ((cl->cl_flags & HFSC_FSC) &&
+ (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0))
+ goto nla_put_failure;
+
+ if ((cl->cl_flags & HFSC_USC) &&
+ (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0))
+ goto nla_put_failure;
+
+ return skb->len;
+
+ nla_put_failure:
+ return -1;
+}
+
+static int
+hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
+ struct tcmsg *tcm)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+ struct nlattr *nest;
+
+ tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->cl_common.classid :
+ TC_H_ROOT;
+ tcm->tcm_handle = cl->cl_common.classid;
+ if (cl->level == 0)
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (hfsc_dump_curves(skb, cl) < 0)
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+ nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int
+hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+ struct tc_hfsc_stats xstats;
+
+ cl->qstats.backlog = cl->qdisc->qstats.backlog;
+ xstats.level = cl->level;
+ xstats.period = cl->cl_vtperiod;
+ xstats.work = cl->cl_total;
+ xstats.rtwork = cl->cl_cumul;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+
+
+static void
+hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i],
+ cl_common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static void
+hfsc_schedule_watchdog(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ u64 next_time = 0;
+
+ cl = eltree_get_minel(q);
+ if (cl)
+ next_time = cl->cl_e;
+ if (q->root.cl_cfmin != 0) {
+ if (next_time == 0 || next_time > q->root.cl_cfmin)
+ next_time = q->root.cl_cfmin;
+ }
+ if (next_time)
+ qdisc_watchdog_schedule(&q->watchdog, next_time);
+}
+
+static int
+hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct tc_hfsc_qopt *qopt;
+ int err;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (!opt || nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+ qopt = nla_data(opt);
+
+ q->defcls = qopt->defcls;
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+ q->eligible = RB_ROOT;
+
+ err = tcf_block_get(&q->root.block, &q->root.filter_list, sch, extack);
+ if (err)
+ return err;
+
+ q->root.cl_common.classid = sch->handle;
+ q->root.sched = q;
+ q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, NULL);
+ if (q->root.qdisc == NULL)
+ q->root.qdisc = &noop_qdisc;
+ else
+ qdisc_hash_add(q->root.qdisc, true);
+ INIT_LIST_HEAD(&q->root.children);
+ q->root.vt_tree = RB_ROOT;
+ q->root.cf_tree = RB_ROOT;
+
+ qdisc_class_hash_insert(&q->clhash, &q->root.cl_common);
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ return 0;
+}
+
+static int
+hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct tc_hfsc_qopt *qopt;
+
+ if (opt == NULL || nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+ qopt = nla_data(opt);
+
+ sch_tree_lock(sch);
+ q->defcls = qopt->defcls;
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static void
+hfsc_reset_class(struct hfsc_class *cl)
+{
+ cl->cl_total = 0;
+ cl->cl_cumul = 0;
+ cl->cl_d = 0;
+ cl->cl_e = 0;
+ cl->cl_vt = 0;
+ cl->cl_vtadj = 0;
+ cl->cl_cvtmin = 0;
+ cl->cl_cvtoff = 0;
+ cl->cl_vtperiod = 0;
+ cl->cl_parentperiod = 0;
+ cl->cl_f = 0;
+ cl->cl_myf = 0;
+ cl->cl_cfmin = 0;
+ cl->cl_nactive = 0;
+
+ cl->vt_tree = RB_ROOT;
+ cl->cf_tree = RB_ROOT;
+ qdisc_reset(cl->qdisc);
+
+ if (cl->cl_flags & HFSC_RSC)
+ rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0);
+ if (cl->cl_flags & HFSC_FSC)
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0);
+ if (cl->cl_flags & HFSC_USC)
+ rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0);
+}
+
+static void
+hfsc_reset_qdisc(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
+ hfsc_reset_class(cl);
+ }
+ q->eligible = RB_ROOT;
+ qdisc_watchdog_cancel(&q->watchdog);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static void
+hfsc_destroy_qdisc(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hlist_node *next;
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) {
+ tcf_block_put(cl->block);
+ cl->block = NULL;
+ }
+ }
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ cl_common.hnode)
+ hfsc_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int
+hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_hfsc_qopt qopt;
+
+ qopt.defcls = q->defcls;
+ if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+ goto nla_put_failure;
+ return skb->len;
+
+ nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int
+hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct hfsc_class *cl;
+ int uninitialized_var(err);
+
+ cl = hfsc_classify(skb, sch, &err);
+ if (cl == NULL) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (cl->qdisc->q.qlen == 1) {
+ unsigned int len = qdisc_pkt_len(skb);
+
+ if (cl->cl_flags & HFSC_RSC)
+ init_ed(cl, len);
+ if (cl->cl_flags & HFSC_FSC)
+ init_vf(cl, len);
+ /*
+ * If this is the first packet, isolate the head so an eventual
+ * head drop before the first dequeue operation has no chance
+ * to invalidate the deadline.
+ */
+ if (cl->cl_flags & HFSC_RSC)
+ cl->qdisc->ops->peek(cl->qdisc);
+
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+hfsc_dequeue(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ struct sk_buff *skb;
+ u64 cur_time;
+ unsigned int next_len;
+ int realtime = 0;
+
+ if (sch->q.qlen == 0)
+ return NULL;
+
+ cur_time = psched_get_time();
+
+ /*
+ * if there are eligible classes, use real-time criteria.
+ * find the class with the minimum deadline among
+ * the eligible classes.
+ */
+ cl = eltree_get_mindl(q, cur_time);
+ if (cl) {
+ realtime = 1;
+ } else {
+ /*
+ * use link-sharing criteria
+ * get the class with the minimum vt in the hierarchy
+ */
+ cl = vttree_get_minvt(&q->root, cur_time);
+ if (cl == NULL) {
+ qdisc_qstats_overlimit(sch);
+ hfsc_schedule_watchdog(sch);
+ return NULL;
+ }
+ }
+
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (skb == NULL) {
+ qdisc_warn_nonwc("HFSC", cl->qdisc);
+ return NULL;
+ }
+
+ bstats_update(&cl->bstats, skb);
+ update_vf(cl, qdisc_pkt_len(skb), cur_time);
+ if (realtime)
+ cl->cl_cumul += qdisc_pkt_len(skb);
+
+ if (cl->cl_flags & HFSC_RSC) {
+ if (cl->qdisc->q.qlen != 0) {
+ /* update ed */
+ next_len = qdisc_peek_len(cl->qdisc);
+ if (realtime)
+ update_ed(cl, next_len);
+ else
+ update_d(cl, next_len);
+ } else {
+ /* the class becomes passive */
+ eltree_remove(cl);
+ }
+ }
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static const struct Qdisc_class_ops hfsc_class_ops = {
+ .change = hfsc_change_class,
+ .delete = hfsc_delete_class,
+ .graft = hfsc_graft_class,
+ .leaf = hfsc_class_leaf,
+ .qlen_notify = hfsc_qlen_notify,
+ .find = hfsc_search_class,
+ .bind_tcf = hfsc_bind_tcf,
+ .unbind_tcf = hfsc_unbind_tcf,
+ .tcf_block = hfsc_tcf_block,
+ .dump = hfsc_dump_class,
+ .dump_stats = hfsc_dump_class_stats,
+ .walk = hfsc_walk
+};
+
+static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = {
+ .id = "hfsc",
+ .init = hfsc_init_qdisc,
+ .change = hfsc_change_qdisc,
+ .reset = hfsc_reset_qdisc,
+ .destroy = hfsc_destroy_qdisc,
+ .dump = hfsc_dump_qdisc,
+ .enqueue = hfsc_enqueue,
+ .dequeue = hfsc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .cl_ops = &hfsc_class_ops,
+ .priv_size = sizeof(struct hfsc_sched),
+ .owner = THIS_MODULE
+};
+
+static int __init
+hfsc_init(void)
+{
+ return register_qdisc(&hfsc_qdisc_ops);
+}
+
+static void __exit
+hfsc_cleanup(void)
+{
+ unregister_qdisc(&hfsc_qdisc_ops);
+}
+
+MODULE_LICENSE("GPL");
+module_init(hfsc_init);
+module_exit(hfsc_cleanup);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
new file mode 100644
index 000000000..c53b72067
--- /dev/null
+++ b/net/sched/sch_hhf.c
@@ -0,0 +1,721 @@
+/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
+ *
+ * Copyright (C) 2013 Terry Lam <vtlam@google.com>
+ * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
+ */
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/siphash.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+/* Heavy-Hitter Filter (HHF)
+ *
+ * Principles :
+ * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
+ * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
+ * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
+ * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
+ * in which the heavy-hitter bucket is served with less weight.
+ * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
+ * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
+ * higher share of bandwidth.
+ *
+ * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
+ * following paper:
+ * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
+ * Accounting", in ACM SIGCOMM, 2002.
+ *
+ * Conceptually, a multi-stage filter comprises k independent hash functions
+ * and k counter arrays. Packets are indexed into k counter arrays by k hash
+ * functions, respectively. The counters are then increased by the packet sizes.
+ * Therefore,
+ * - For a heavy-hitter flow: *all* of its k array counters must be large.
+ * - For a non-heavy-hitter flow: some of its k array counters can be large
+ * due to hash collision with other small flows; however, with high
+ * probability, not *all* k counters are large.
+ *
+ * By the design of the multi-stage filter algorithm, the false negative rate
+ * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
+ * susceptible to false positives (non-heavy-hitters mistakenly classified as
+ * heavy-hitters).
+ * Therefore, we also implement the following optimizations to reduce false
+ * positives by avoiding unnecessary increment of the counter values:
+ * - Optimization O1: once a heavy-hitter is identified, its bytes are not
+ * accounted in the array counters. This technique is called "shielding"
+ * in Section 3.3.1 of [EV02].
+ * - Optimization O2: conservative update of counters
+ * (Section 3.3.2 of [EV02]),
+ * New counter value = max {old counter value,
+ * smallest counter value + packet bytes}
+ *
+ * Finally, we refresh the counters periodically since otherwise the counter
+ * values will keep accumulating.
+ *
+ * Once a flow is classified as heavy-hitter, we also save its per-flow state
+ * in an exact-matching flow table so that its subsequent packets can be
+ * dispatched to the heavy-hitter bucket accordingly.
+ *
+ *
+ * At a high level, this qdisc works as follows:
+ * Given a packet p:
+ * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
+ * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
+ * bucket.
+ * - Otherwise, forward p to the multi-stage filter, denoted filter F
+ * + If F decides that p belongs to a non-heavy-hitter flow, then send p
+ * to the non-heavy-hitter bucket.
+ * + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
+ * then set up a new flow entry for the flow-id of p in the table T and
+ * send p to the heavy-hitter bucket.
+ *
+ * In this implementation:
+ * - T is a fixed-size hash-table with 1024 entries. Hash collision is
+ * resolved by linked-list chaining.
+ * - F has four counter arrays, each array containing 1024 32-bit counters.
+ * That means 4 * 1024 * 32 bits = 16KB of memory.
+ * - Since each array in F contains 1024 counters, 10 bits are sufficient to
+ * index into each array.
+ * Hence, instead of having four hash functions, we chop the 32-bit
+ * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
+ * computed as XOR sum of those three chunks.
+ * - We need to clear the counter arrays periodically; however, directly
+ * memsetting 16KB of memory can lead to cache eviction and unwanted delay.
+ * So by representing each counter by a valid bit, we only need to reset
+ * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory.
+ * - The Deficit Round Robin engine is taken from fq_codel implementation
+ * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
+ * fq_codel_flow in fq_codel implementation.
+ *
+ */
+
+/* Non-configurable parameters */
+#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */
+#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */
+#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */
+#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */
+#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */
+
+#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */
+enum wdrr_bucket_idx {
+ WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */
+ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */
+};
+
+#define hhf_time_before(a, b) \
+ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
+
+/* Heavy-hitter per-flow state */
+struct hh_flow_state {
+ u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */
+ u32 hit_timestamp; /* last time heavy-hitter was seen */
+ struct list_head flowchain; /* chaining under hash collision */
+};
+
+/* Weighted Deficit Round Robin (WDRR) scheduler */
+struct wdrr_bucket {
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head bucketchain;
+ int deficit;
+};
+
+struct hhf_sched_data {
+ struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
+ siphash_key_t perturbation; /* hash perturbation */
+ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ u32 drop_overlimit; /* number of times max qdisc packet
+ * limit was hit
+ */
+ struct list_head *hh_flows; /* table T (currently active HHs) */
+ u32 hh_flows_limit; /* max active HH allocs */
+ u32 hh_flows_overlimit; /* num of disallowed HH allocs */
+ u32 hh_flows_total_cnt; /* total admitted HHs */
+ u32 hh_flows_current_cnt; /* total current HHs */
+ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
+ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays
+ * was reset
+ */
+ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
+ * of hhf_arrays
+ */
+ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
+ struct list_head new_buckets; /* list of new buckets */
+ struct list_head old_buckets; /* list of old buckets */
+
+ /* Configurable HHF parameters */
+ u32 hhf_reset_timeout; /* interval to reset counter
+ * arrays in filter F
+ * (default 40ms)
+ */
+ u32 hhf_admit_bytes; /* counter thresh to classify as
+ * HH (default 128KB).
+ * With these default values,
+ * 128KB / 40ms = 25 Mbps
+ * i.e., we expect to capture HHs
+ * sending > 25 Mbps.
+ */
+ u32 hhf_evict_timeout; /* aging threshold to evict idle
+ * HHs out of table T. This should
+ * be large enough to avoid
+ * reordering during HH eviction.
+ * (default 1s)
+ */
+ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs
+ * (default 2,
+ * i.e., non-HH : HH = 2 : 1)
+ */
+};
+
+static u32 hhf_time_stamp(void)
+{
+ return jiffies;
+}
+
+/* Looks up a heavy-hitter flow in a chaining list of table T. */
+static struct hh_flow_state *seek_list(const u32 hash,
+ struct list_head *head,
+ struct hhf_sched_data *q)
+{
+ struct hh_flow_state *flow, *next;
+ u32 now = hhf_time_stamp();
+
+ if (list_empty(head))
+ return NULL;
+
+ list_for_each_entry_safe(flow, next, head, flowchain) {
+ u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
+
+ if (hhf_time_before(prev, now)) {
+ /* Delete expired heavy-hitters, but preserve one entry
+ * to avoid kzalloc() when next time this slot is hit.
+ */
+ if (list_is_last(&flow->flowchain, head))
+ return NULL;
+ list_del(&flow->flowchain);
+ kfree(flow);
+ q->hh_flows_current_cnt--;
+ } else if (flow->hash_id == hash) {
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired
+ * entry or dynamically alloc a new entry.
+ */
+static struct hh_flow_state *alloc_new_hh(struct list_head *head,
+ struct hhf_sched_data *q)
+{
+ struct hh_flow_state *flow;
+ u32 now = hhf_time_stamp();
+
+ if (!list_empty(head)) {
+ /* Find an expired heavy-hitter flow entry. */
+ list_for_each_entry(flow, head, flowchain) {
+ u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
+
+ if (hhf_time_before(prev, now))
+ return flow;
+ }
+ }
+
+ if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
+ q->hh_flows_overlimit++;
+ return NULL;
+ }
+ /* Create new entry. */
+ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
+ if (!flow)
+ return NULL;
+
+ q->hh_flows_current_cnt++;
+ INIT_LIST_HEAD(&flow->flowchain);
+ list_add_tail(&flow->flowchain, head);
+
+ return flow;
+}
+
+/* Assigns packets to WDRR buckets. Implements a multi-stage filter to
+ * classify heavy-hitters.
+ */
+static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ u32 tmp_hash, hash;
+ u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
+ struct hh_flow_state *flow;
+ u32 pkt_len, min_hhf_val;
+ int i;
+ u32 prev;
+ u32 now = hhf_time_stamp();
+
+ /* Reset the HHF counter arrays if this is the right time. */
+ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
+ if (hhf_time_before(prev, now)) {
+ for (i = 0; i < HHF_ARRAYS_CNT; i++)
+ bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
+ q->hhf_arrays_reset_timestamp = now;
+ }
+
+ /* Get hashed flow-id of the skb. */
+ hash = skb_get_hash_perturb(skb, &q->perturbation);
+
+ /* Check if this packet belongs to an already established HH flow. */
+ flow_pos = hash & HHF_BIT_MASK;
+ flow = seek_list(hash, &q->hh_flows[flow_pos], q);
+ if (flow) { /* found its HH flow */
+ flow->hit_timestamp = now;
+ return WDRR_BUCKET_FOR_HH;
+ }
+
+ /* Now pass the packet through the multi-stage filter. */
+ tmp_hash = hash;
+ xorsum = 0;
+ for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
+ /* Split the skb_hash into three 10-bit chunks. */
+ filter_pos[i] = tmp_hash & HHF_BIT_MASK;
+ xorsum ^= filter_pos[i];
+ tmp_hash >>= HHF_BIT_MASK_LEN;
+ }
+ /* The last chunk is computed as XOR sum of other chunks. */
+ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
+
+ pkt_len = qdisc_pkt_len(skb);
+ min_hhf_val = ~0U;
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ u32 val;
+
+ if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
+ q->hhf_arrays[i][filter_pos[i]] = 0;
+ __set_bit(filter_pos[i], q->hhf_valid_bits[i]);
+ }
+
+ val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
+ if (min_hhf_val > val)
+ min_hhf_val = val;
+ }
+
+ /* Found a new HH iff all counter values > HH admit threshold. */
+ if (min_hhf_val > q->hhf_admit_bytes) {
+ /* Just captured a new heavy-hitter. */
+ flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
+ if (!flow) /* memory alloc problem */
+ return WDRR_BUCKET_FOR_NON_HH;
+ flow->hash_id = hash;
+ flow->hit_timestamp = now;
+ q->hh_flows_total_cnt++;
+
+ /* By returning without updating counters in q->hhf_arrays,
+ * we implicitly implement "shielding" (see Optimization O1).
+ */
+ return WDRR_BUCKET_FOR_HH;
+ }
+
+ /* Conservative update of HHF arrays (see Optimization O2). */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
+ q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
+ }
+ return WDRR_BUCKET_FOR_NON_HH;
+}
+
+/* Removes one skb from head of bucket. */
+static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
+{
+ struct sk_buff *skb = bucket->head;
+
+ bucket->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+/* Tail-adds skb to bucket. */
+static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
+{
+ if (bucket->head == NULL)
+ bucket->head = skb;
+ else
+ bucket->tail->next = skb;
+ bucket->tail = skb;
+ skb->next = NULL;
+}
+
+static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct wdrr_bucket *bucket;
+
+ /* Always try to drop from heavy-hitters first. */
+ bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
+ if (!bucket->head)
+ bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];
+
+ if (bucket->head) {
+ struct sk_buff *skb = dequeue_head(bucket);
+
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_drop(skb, sch, to_free);
+ }
+
+ /* Return id of the bucket from which the packet was dropped. */
+ return bucket - q->buckets;
+}
+
+static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ enum wdrr_bucket_idx idx;
+ struct wdrr_bucket *bucket;
+ unsigned int prev_backlog;
+
+ idx = hhf_classify(skb, sch);
+
+ bucket = &q->buckets[idx];
+ bucket_add(bucket, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+
+ if (list_empty(&bucket->bucketchain)) {
+ unsigned int weight;
+
+ /* The logic of new_buckets vs. old_buckets is the same as
+ * new_flows vs. old_flows in the implementation of fq_codel,
+ * i.e., short bursts of non-HHs should have strict priority.
+ */
+ if (idx == WDRR_BUCKET_FOR_HH) {
+ /* Always move heavy-hitters to old bucket. */
+ weight = 1;
+ list_add_tail(&bucket->bucketchain, &q->old_buckets);
+ } else {
+ weight = q->hhf_non_hh_weight;
+ list_add_tail(&bucket->bucketchain, &q->new_buckets);
+ }
+ bucket->deficit = weight * q->quantum;
+ }
+ if (++sch->q.qlen <= sch->limit)
+ return NET_XMIT_SUCCESS;
+
+ prev_backlog = sch->qstats.backlog;
+ q->drop_overlimit++;
+ /* Return Congestion Notification only if we dropped a packet from this
+ * bucket.
+ */
+ if (hhf_drop(sch, to_free) == idx)
+ return NET_XMIT_CN;
+
+ /* As we dropped a packet, better let upper stack know this. */
+ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = NULL;
+ struct wdrr_bucket *bucket;
+ struct list_head *head;
+
+begin:
+ head = &q->new_buckets;
+ if (list_empty(head)) {
+ head = &q->old_buckets;
+ if (list_empty(head))
+ return NULL;
+ }
+ bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
+
+ if (bucket->deficit <= 0) {
+ int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
+ 1 : q->hhf_non_hh_weight;
+
+ bucket->deficit += weight * q->quantum;
+ list_move_tail(&bucket->bucketchain, &q->old_buckets);
+ goto begin;
+ }
+
+ if (bucket->head) {
+ skb = dequeue_head(bucket);
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ }
+
+ if (!skb) {
+ /* Force a pass through old_buckets to prevent starvation. */
+ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
+ list_move_tail(&bucket->bucketchain, &q->old_buckets);
+ else
+ list_del_init(&bucket->bucketchain);
+ goto begin;
+ }
+ qdisc_bstats_update(sch, skb);
+ bucket->deficit -= qdisc_pkt_len(skb);
+
+ return skb;
+}
+
+static void hhf_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = hhf_dequeue(sch)) != NULL)
+ rtnl_kfree_skbs(skb, skb);
+}
+
+static void hhf_destroy(struct Qdisc *sch)
+{
+ int i;
+ struct hhf_sched_data *q = qdisc_priv(sch);
+
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ kvfree(q->hhf_arrays[i]);
+ kvfree(q->hhf_valid_bits[i]);
+ }
+
+ if (!q->hh_flows)
+ return;
+
+ for (i = 0; i < HH_FLOWS_CNT; i++) {
+ struct hh_flow_state *flow, *next;
+ struct list_head *head = &q->hh_flows[i];
+
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(flow, next, head, flowchain) {
+ list_del(&flow->flowchain);
+ kfree(flow);
+ }
+ }
+ kvfree(q->hh_flows);
+}
+
+static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
+ [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 },
+ [TCA_HHF_QUANTUM] = { .type = NLA_U32 },
+ [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
+ [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 },
+ [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 },
+ [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 },
+ [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 },
+};
+
+static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_HHF_MAX + 1];
+ unsigned int qlen, prev_backlog;
+ int err;
+ u64 non_hh_quantum;
+ u32 new_quantum = q->quantum;
+ u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_HHF_QUANTUM])
+ new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
+
+ if (tb[TCA_HHF_NON_HH_WEIGHT])
+ new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
+
+ non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
+ if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX)
+ return -EINVAL;
+
+ sch_tree_lock(sch);
+
+ if (tb[TCA_HHF_BACKLOG_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
+
+ q->quantum = new_quantum;
+ q->hhf_non_hh_weight = new_hhf_non_hh_weight;
+
+ if (tb[TCA_HHF_HH_FLOWS_LIMIT])
+ q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
+
+ if (tb[TCA_HHF_RESET_TIMEOUT]) {
+ u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
+
+ q->hhf_reset_timeout = usecs_to_jiffies(us);
+ }
+
+ if (tb[TCA_HHF_ADMIT_BYTES])
+ q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
+
+ if (tb[TCA_HHF_EVICT_TIMEOUT]) {
+ u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
+
+ q->hhf_evict_timeout = usecs_to_jiffies(us);
+ }
+
+ qlen = sch->q.qlen;
+ prev_backlog = sch->qstats.backlog;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = hhf_dequeue(sch);
+
+ rtnl_kfree_skbs(skb, skb);
+ }
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
+ prev_backlog - sch->qstats.backlog);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ sch->limit = 1000;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
+ INIT_LIST_HEAD(&q->new_buckets);
+ INIT_LIST_HEAD(&q->old_buckets);
+
+ /* Configurable HHF parameters */
+ q->hhf_reset_timeout = HZ / 25; /* 40 ms */
+ q->hhf_admit_bytes = 131072; /* 128 KB */
+ q->hhf_evict_timeout = HZ; /* 1 sec */
+ q->hhf_non_hh_weight = 2;
+
+ if (opt) {
+ int err = hhf_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ if (!q->hh_flows) {
+ /* Initialize heavy-hitter flow table. */
+ q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!q->hh_flows)
+ return -ENOMEM;
+ for (i = 0; i < HH_FLOWS_CNT; i++)
+ INIT_LIST_HEAD(&q->hh_flows[i]);
+
+ /* Cap max active HHs at twice len of hh_flows table. */
+ q->hh_flows_limit = 2 * HH_FLOWS_CNT;
+ q->hh_flows_overlimit = 0;
+ q->hh_flows_total_cnt = 0;
+ q->hh_flows_current_cnt = 0;
+
+ /* Initialize heavy-hitter filter arrays. */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN,
+ sizeof(u32),
+ GFP_KERNEL);
+ if (!q->hhf_arrays[i]) {
+ /* Note: hhf_destroy() will be called
+ * by our caller.
+ */
+ return -ENOMEM;
+ }
+ }
+ q->hhf_arrays_reset_timestamp = hhf_time_stamp();
+
+ /* Initialize valid bits of heavy-hitter filter arrays. */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN /
+ BITS_PER_BYTE, GFP_KERNEL);
+ if (!q->hhf_valid_bits[i]) {
+ /* Note: hhf_destroy() will be called
+ * by our caller.
+ */
+ return -ENOMEM;
+ }
+ }
+
+ /* Initialize Weighted DRR buckets. */
+ for (i = 0; i < WDRR_BUCKET_CNT; i++) {
+ struct wdrr_bucket *bucket = q->buckets + i;
+
+ INIT_LIST_HEAD(&bucket->bucketchain);
+ }
+ }
+
+ return 0;
+}
+
+static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
+ nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
+ jiffies_to_usecs(q->hhf_reset_timeout)) ||
+ nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
+ nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
+ jiffies_to_usecs(q->hhf_evict_timeout)) ||
+ nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct tc_hhf_xstats st = {
+ .drop_overlimit = q->drop_overlimit,
+ .hh_overlimit = q->hh_flows_overlimit,
+ .hh_tot_count = q->hh_flows_total_cnt,
+ .hh_cur_count = q->hh_flows_current_cnt,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
+ .id = "hhf",
+ .priv_size = sizeof(struct hhf_sched_data),
+
+ .enqueue = hhf_enqueue,
+ .dequeue = hhf_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = hhf_init,
+ .reset = hhf_reset,
+ .destroy = hhf_destroy,
+ .change = hhf_change,
+ .dump = hhf_dump,
+ .dump_stats = hhf_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init hhf_module_init(void)
+{
+ return register_qdisc(&hhf_qdisc_ops);
+}
+
+static void __exit hhf_module_exit(void)
+{
+ unregister_qdisc(&hhf_qdisc_ops);
+}
+
+module_init(hhf_module_init)
+module_exit(hhf_module_exit)
+MODULE_AUTHOR("Terry Lam");
+MODULE_AUTHOR("Nandita Dukkipati");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
new file mode 100644
index 000000000..862a33b9e
--- /dev/null
+++ b/net/sched/sch_htb.c
@@ -0,0 +1,1621 @@
+/*
+ * net/sched/sch_htb.c Hierarchical token bucket, feed tree version
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Martin Devera, <devik@cdi.cz>
+ *
+ * Credits (in time order) for older HTB versions:
+ * Stef Coene <stef.coene@docum.org>
+ * HTB support at LARTC mailing list
+ * Ondrej Kraus, <krauso@barr.cz>
+ * found missing INIT_QDISC(htb)
+ * Vladimir Smelhaus, Aamer Akhter, Bert Hubert
+ * helped a lot to locate nasty class stall bug
+ * Andi Kleen, Jamal Hadi, Bert Hubert
+ * code review and helpful comments on shaping
+ * Tomasz Wrona, <tw@eter.tym.pl>
+ * created test case so that I was able to fix nasty bug
+ * Wilfried Weissmann
+ * spotted bug in dequeue code and helped with fix
+ * Jiri Fojtasek
+ * fixed requeue routine
+ * and many others. thanks.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+/* HTB algorithm.
+ Author: devik@cdi.cz
+ ========================================================================
+ HTB is like TBF with multiple classes. It is also similar to CBQ because
+ it allows to assign priority to each class in hierarchy.
+ In fact it is another implementation of Floyd's formal sharing.
+
+ Levels:
+ Each class is assigned level. Leaf has ALWAYS level 0 and root
+ classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
+ one less than their parent.
+*/
+
+static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
+#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */
+
+#if HTB_VER >> 16 != TC_HTB_PROTOVER
+#error "Mismatched sch_htb.c and pkt_sch.h"
+#endif
+
+/* Module parameter and sysfs export */
+module_param (htb_hysteresis, int, 0640);
+MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+
+static int htb_rate_est = 0; /* htb classes have a default rate estimator */
+module_param(htb_rate_est, int, 0640);
+MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
+
+/* used internaly to keep status of single class */
+enum htb_cmode {
+ HTB_CANT_SEND, /* class can't send and can't borrow */
+ HTB_MAY_BORROW, /* class can't send but may borrow */
+ HTB_CAN_SEND /* class can send */
+};
+
+struct htb_prio {
+ union {
+ struct rb_root row;
+ struct rb_root feed;
+ };
+ struct rb_node *ptr;
+ /* When class changes from state 1->2 and disconnects from
+ * parent's feed then we lost ptr value and start from the
+ * first child again. Here we store classid of the
+ * last valid ptr (used when ptr is NULL).
+ */
+ u32 last_ptr_id;
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L:
+ * To reduce false sharing, place mostly read fields at beginning,
+ * and mostly written ones at the end.
+ */
+struct htb_class {
+ struct Qdisc_class_common common;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg ceil;
+ s64 buffer, cbuffer;/* token bucket depth/rate */
+ s64 mbuffer; /* max wait time */
+ u32 prio; /* these two are used only by leaves... */
+ int quantum; /* but stored for parent-to-leaf return */
+
+ struct tcf_proto __rcu *filter_list; /* class attached filters */
+ struct tcf_block *block;
+ int filter_cnt;
+
+ int level; /* our level (see above) */
+ unsigned int children;
+ struct htb_class *parent; /* parent class */
+
+ struct net_rate_estimator __rcu *rate_est;
+
+ /*
+ * Written often fields
+ */
+ struct gnet_stats_basic_packed bstats;
+ struct tc_htb_xstats xstats; /* our special stats */
+
+ /* token bucket parameters */
+ s64 tokens, ctokens;/* current number of tokens */
+ s64 t_c; /* checkpoint time */
+
+ union {
+ struct htb_class_leaf {
+ int deficit[TC_HTB_MAXDEPTH];
+ struct Qdisc *q;
+ } leaf;
+ struct htb_class_inner {
+ struct htb_prio clprio[TC_HTB_NUMPRIO];
+ } inner;
+ } un;
+ s64 pq_key;
+
+ int prio_activity; /* for which prios are we active */
+ enum htb_cmode cmode; /* current mode of the class */
+ struct rb_node pq_node; /* node for event queue */
+ struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+
+ unsigned int drops ____cacheline_aligned_in_smp;
+ unsigned int overlimits;
+};
+
+struct htb_level {
+ struct rb_root wait_pq;
+ struct htb_prio hprio[TC_HTB_NUMPRIO];
+};
+
+struct htb_sched {
+ struct Qdisc_class_hash clhash;
+ int defcls; /* class where unclassified flows go to */
+ int rate2quantum; /* quant = rate / rate2quantum */
+
+ /* filters for qdisc itself */
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+
+#define HTB_WARN_TOOMANYEVENTS 0x1
+ unsigned int warned; /* only one warning */
+ int direct_qlen;
+ struct work_struct work;
+
+ /* non shaped skbs; let them go directly thru */
+ struct qdisc_skb_head direct_queue;
+ long direct_pkts;
+
+ struct qdisc_watchdog watchdog;
+
+ s64 now; /* cached dequeue time */
+
+ /* time of nearest event per level (row) */
+ s64 near_ev_cache[TC_HTB_MAXDEPTH];
+
+ int row_mask[TC_HTB_MAXDEPTH];
+
+ struct htb_level hlevel[TC_HTB_MAXDEPTH];
+};
+
+/* find class in global hash table using given handle */
+static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, handle);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct htb_class, common);
+}
+
+static unsigned long htb_search(struct Qdisc *sch, u32 handle)
+{
+ return (unsigned long)htb_find(handle, sch);
+}
+/**
+ * htb_classify - classify a packet into class
+ *
+ * It returns NULL if the packet should be dropped or -1 if the packet
+ * should be passed directly thru. In all other cases leaf class is returned.
+ * We allow direct class selection by classid in priority. The we examine
+ * filters in qdisc and in inner nodes (if higher filter points to the inner
+ * node). If we end up with classid MAJOR:0 we enqueue the skb into special
+ * internal fifo (direct). These packets then go directly thru. If we still
+ * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
+ * then finish and return direct queue.
+ */
+#define HTB_DIRECT ((struct htb_class *)-1L)
+
+static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl;
+ struct tcf_result res;
+ struct tcf_proto *tcf;
+ int result;
+
+ /* allow to select class by setting skb->priority to valid classid;
+ * note that nfmark can be used too by attaching filter fw with no
+ * rules in it
+ */
+ if (skb->priority == sch->handle)
+ return HTB_DIRECT; /* X:0 (direct flow) selected */
+ cl = htb_find(skb->priority, sch);
+ if (cl) {
+ if (cl->level == 0)
+ return cl;
+ /* Start with inner filter chain if a non-leaf class is selected */
+ tcf = rcu_dereference_bh(cl->filter_list);
+ } else {
+ tcf = rcu_dereference_bh(q->filter_list);
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (void *)res.class;
+ if (!cl) {
+ if (res.classid == sch->handle)
+ return HTB_DIRECT; /* X:0 (direct flow) */
+ cl = htb_find(res.classid, sch);
+ if (!cl)
+ break; /* filter selected invalid classid */
+ }
+ if (!cl->level)
+ return cl; /* we hit leaf; return it */
+
+ /* we have got inner class; apply inner filter chain */
+ tcf = rcu_dereference_bh(cl->filter_list);
+ }
+ /* classification failed; try to use default class */
+ cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+ if (!cl || cl->level)
+ return HTB_DIRECT; /* bad default .. this is safe bet */
+ return cl;
+}
+
+/**
+ * htb_add_to_id_tree - adds class to the round robin list
+ *
+ * Routine adds class to the list (actually tree) sorted by classid.
+ * Make sure that class is not already on such list for given prio.
+ */
+static void htb_add_to_id_tree(struct rb_root *root,
+ struct htb_class *cl, int prio)
+{
+ struct rb_node **p = &root->rb_node, *parent = NULL;
+
+ while (*p) {
+ struct htb_class *c;
+ parent = *p;
+ c = rb_entry(parent, struct htb_class, node[prio]);
+
+ if (cl->common.classid > c->common.classid)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->node[prio], parent, p);
+ rb_insert_color(&cl->node[prio], root);
+}
+
+/**
+ * htb_add_to_wait_tree - adds class to the event queue with delay
+ *
+ * The class is added to priority event queue to indicate that class will
+ * change its mode in cl->pq_key microseconds. Make sure that class is not
+ * already in the queue.
+ */
+static void htb_add_to_wait_tree(struct htb_sched *q,
+ struct htb_class *cl, s64 delay)
+{
+ struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
+
+ cl->pq_key = q->now + delay;
+ if (cl->pq_key == q->now)
+ cl->pq_key++;
+
+ /* update the nearest event cache */
+ if (q->near_ev_cache[cl->level] > cl->pq_key)
+ q->near_ev_cache[cl->level] = cl->pq_key;
+
+ while (*p) {
+ struct htb_class *c;
+ parent = *p;
+ c = rb_entry(parent, struct htb_class, pq_node);
+ if (cl->pq_key >= c->pq_key)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->pq_node, parent, p);
+ rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
+}
+
+/**
+ * htb_next_rb_node - finds next node in binary tree
+ *
+ * When we are past last key we return NULL.
+ * Average complexity is 2 steps per call.
+ */
+static inline void htb_next_rb_node(struct rb_node **n)
+{
+ *n = rb_next(*n);
+}
+
+/**
+ * htb_add_class_to_row - add class to its row
+ *
+ * The class is added to row at priorities marked in mask.
+ * It does nothing if mask == 0.
+ */
+static inline void htb_add_class_to_row(struct htb_sched *q,
+ struct htb_class *cl, int mask)
+{
+ q->row_mask[cl->level] |= mask;
+ while (mask) {
+ int prio = ffz(~mask);
+ mask &= ~(1 << prio);
+ htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
+ }
+}
+
+/* If this triggers, it is a bug in this code, but it need not be fatal */
+static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
+{
+ if (RB_EMPTY_NODE(rb)) {
+ WARN_ON(1);
+ } else {
+ rb_erase(rb, root);
+ RB_CLEAR_NODE(rb);
+ }
+}
+
+
+/**
+ * htb_remove_class_from_row - removes class from its row
+ *
+ * The class is removed from row at priorities marked in mask.
+ * It does nothing if mask == 0.
+ */
+static inline void htb_remove_class_from_row(struct htb_sched *q,
+ struct htb_class *cl, int mask)
+{
+ int m = 0;
+ struct htb_level *hlevel = &q->hlevel[cl->level];
+
+ while (mask) {
+ int prio = ffz(~mask);
+ struct htb_prio *hprio = &hlevel->hprio[prio];
+
+ mask &= ~(1 << prio);
+ if (hprio->ptr == cl->node + prio)
+ htb_next_rb_node(&hprio->ptr);
+
+ htb_safe_rb_erase(cl->node + prio, &hprio->row);
+ if (!hprio->row.rb_node)
+ m |= 1 << prio;
+ }
+ q->row_mask[cl->level] &= ~m;
+}
+
+/**
+ * htb_activate_prios - creates active classe's feed chain
+ *
+ * The class is connected to ancestors and/or appropriate rows
+ * for priorities it is participating on. cl->cmode must be new
+ * (activated) mode. It does nothing if cl->prio_activity == 0.
+ */
+static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
+{
+ struct htb_class *p = cl->parent;
+ long m, mask = cl->prio_activity;
+
+ while (cl->cmode == HTB_MAY_BORROW && p && mask) {
+ m = mask;
+ while (m) {
+ int prio = ffz(~m);
+ m &= ~(1 << prio);
+
+ if (p->un.inner.clprio[prio].feed.rb_node)
+ /* parent already has its feed in use so that
+ * reset bit in mask as parent is already ok
+ */
+ mask &= ~(1 << prio);
+
+ htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
+ }
+ p->prio_activity |= mask;
+ cl = p;
+ p = cl->parent;
+
+ }
+ if (cl->cmode == HTB_CAN_SEND && mask)
+ htb_add_class_to_row(q, cl, mask);
+}
+
+/**
+ * htb_deactivate_prios - remove class from feed chain
+ *
+ * cl->cmode must represent old mode (before deactivation). It does
+ * nothing if cl->prio_activity == 0. Class is removed from all feed
+ * chains and rows.
+ */
+static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
+{
+ struct htb_class *p = cl->parent;
+ long m, mask = cl->prio_activity;
+
+ while (cl->cmode == HTB_MAY_BORROW && p && mask) {
+ m = mask;
+ mask = 0;
+ while (m) {
+ int prio = ffz(~m);
+ m &= ~(1 << prio);
+
+ if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
+ /* we are removing child which is pointed to from
+ * parent feed - forget the pointer but remember
+ * classid
+ */
+ p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
+ p->un.inner.clprio[prio].ptr = NULL;
+ }
+
+ htb_safe_rb_erase(cl->node + prio,
+ &p->un.inner.clprio[prio].feed);
+
+ if (!p->un.inner.clprio[prio].feed.rb_node)
+ mask |= 1 << prio;
+ }
+
+ p->prio_activity &= ~mask;
+ cl = p;
+ p = cl->parent;
+
+ }
+ if (cl->cmode == HTB_CAN_SEND && mask)
+ htb_remove_class_from_row(q, cl, mask);
+}
+
+static inline s64 htb_lowater(const struct htb_class *cl)
+{
+ if (htb_hysteresis)
+ return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
+ else
+ return 0;
+}
+static inline s64 htb_hiwater(const struct htb_class *cl)
+{
+ if (htb_hysteresis)
+ return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
+ else
+ return 0;
+}
+
+
+/**
+ * htb_class_mode - computes and returns current class mode
+ *
+ * It computes cl's mode at time cl->t_c+diff and returns it. If mode
+ * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
+ * from now to time when cl will change its state.
+ * Also it is worth to note that class mode doesn't change simply
+ * at cl->{c,}tokens == 0 but there can rather be hysteresis of
+ * 0 .. -cl->{c,}buffer range. It is meant to limit number of
+ * mode transitions per time unit. The speed gain is about 1/6.
+ */
+static inline enum htb_cmode
+htb_class_mode(struct htb_class *cl, s64 *diff)
+{
+ s64 toks;
+
+ if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
+ *diff = -toks;
+ return HTB_CANT_SEND;
+ }
+
+ if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
+ return HTB_CAN_SEND;
+
+ *diff = -toks;
+ return HTB_MAY_BORROW;
+}
+
+/**
+ * htb_change_class_mode - changes classe's mode
+ *
+ * This should be the only way how to change classe's mode under normal
+ * cirsumstances. Routine will update feed lists linkage, change mode
+ * and add class to the wait event queue if appropriate. New mode should
+ * be different from old one and cl->pq_key has to be valid if changing
+ * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
+ */
+static void
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
+{
+ enum htb_cmode new_mode = htb_class_mode(cl, diff);
+
+ if (new_mode == cl->cmode)
+ return;
+
+ if (new_mode == HTB_CANT_SEND)
+ cl->overlimits++;
+
+ if (cl->prio_activity) { /* not necessary: speed optimization */
+ if (cl->cmode != HTB_CANT_SEND)
+ htb_deactivate_prios(q, cl);
+ cl->cmode = new_mode;
+ if (new_mode != HTB_CANT_SEND)
+ htb_activate_prios(q, cl);
+ } else
+ cl->cmode = new_mode;
+}
+
+/**
+ * htb_activate - inserts leaf cl into appropriate active feeds
+ *
+ * Routine learns (new) priority of leaf and activates feed chain
+ * for the prio. It can be called on already active leaf safely.
+ * It also adds leaf into droplist.
+ */
+static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
+{
+ WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
+
+ if (!cl->prio_activity) {
+ cl->prio_activity = 1 << cl->prio;
+ htb_activate_prios(q, cl);
+ }
+}
+
+/**
+ * htb_deactivate - remove leaf cl from active feeds
+ *
+ * Make sure that leaf is active. In the other words it can't be called
+ * with non-active leaf. It also removes class from the drop list.
+ */
+static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
+{
+ WARN_ON(!cl->prio_activity);
+
+ htb_deactivate_prios(q, cl);
+ cl->prio_activity = 0;
+}
+
+static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
+ struct qdisc_skb_head *qh)
+{
+ struct sk_buff *last = qh->tail;
+
+ if (last) {
+ skb->next = NULL;
+ last->next = skb;
+ qh->tail = skb;
+ } else {
+ qh->tail = skb;
+ qh->head = skb;
+ }
+ qh->qlen++;
+}
+
+static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ int uninitialized_var(ret);
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = htb_classify(skb, sch, &ret);
+
+ if (cl == HTB_DIRECT) {
+ /* enqueue to helper queue */
+ if (q->direct_queue.qlen < q->direct_qlen) {
+ htb_enqueue_tail(skb, sch, &q->direct_queue);
+ q->direct_pkts++;
+ } else {
+ return qdisc_drop(skb, sch, to_free);
+ }
+#ifdef CONFIG_NET_CLS_ACT
+ } else if (!cl) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+#endif
+ } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
+ to_free)) != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret)) {
+ qdisc_qstats_drop(sch);
+ cl->drops++;
+ }
+ return ret;
+ } else {
+ htb_activate(q, cl);
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+}
+
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
+{
+ s64 toks = diff + cl->tokens;
+
+ if (toks > cl->buffer)
+ toks = cl->buffer;
+ toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
+ if (toks <= -cl->mbuffer)
+ toks = 1 - cl->mbuffer;
+
+ cl->tokens = toks;
+}
+
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
+{
+ s64 toks = diff + cl->ctokens;
+
+ if (toks > cl->cbuffer)
+ toks = cl->cbuffer;
+ toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
+ if (toks <= -cl->mbuffer)
+ toks = 1 - cl->mbuffer;
+
+ cl->ctokens = toks;
+}
+
+/**
+ * htb_charge_class - charges amount "bytes" to leaf and ancestors
+ *
+ * Routine assumes that packet "bytes" long was dequeued from leaf cl
+ * borrowing from "level". It accounts bytes to ceil leaky bucket for
+ * leaf and all ancestors and to rate bucket for ancestors at levels
+ * "level" and higher. It also handles possible change of mode resulting
+ * from the update. Note that mode can also increase here (MAY_BORROW to
+ * CAN_SEND) because we can use more precise clock that event queue here.
+ * In such case we remove class from event queue first.
+ */
+static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
+ int level, struct sk_buff *skb)
+{
+ int bytes = qdisc_pkt_len(skb);
+ enum htb_cmode old_mode;
+ s64 diff;
+
+ while (cl) {
+ diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
+ if (cl->level >= level) {
+ if (cl->level == level)
+ cl->xstats.lends++;
+ htb_accnt_tokens(cl, bytes, diff);
+ } else {
+ cl->xstats.borrows++;
+ cl->tokens += diff; /* we moved t_c; update tokens */
+ }
+ htb_accnt_ctokens(cl, bytes, diff);
+ cl->t_c = q->now;
+
+ old_mode = cl->cmode;
+ diff = 0;
+ htb_change_class_mode(q, cl, &diff);
+ if (old_mode != cl->cmode) {
+ if (old_mode != HTB_CAN_SEND)
+ htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
+ if (cl->cmode != HTB_CAN_SEND)
+ htb_add_to_wait_tree(q, cl, diff);
+ }
+
+ /* update basic stats except for leaves which are already updated */
+ if (cl->level)
+ bstats_update(&cl->bstats, skb);
+
+ cl = cl->parent;
+ }
+}
+
+/**
+ * htb_do_events - make mode changes to classes at the level
+ *
+ * Scans event queue for pending events and applies them. Returns time of
+ * next pending event (0 for no event in pq, q->now for too many events).
+ * Note: Applied are events whose have cl->pq_key <= q->now.
+ */
+static s64 htb_do_events(struct htb_sched *q, const int level,
+ unsigned long start)
+{
+ /* don't run for longer than 2 jiffies; 2 is used instead of
+ * 1 to simplify things when jiffy is going to be incremented
+ * too soon
+ */
+ unsigned long stop_at = start + 2;
+ struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
+
+ while (time_before(jiffies, stop_at)) {
+ struct htb_class *cl;
+ s64 diff;
+ struct rb_node *p = rb_first(wait_pq);
+
+ if (!p)
+ return 0;
+
+ cl = rb_entry(p, struct htb_class, pq_node);
+ if (cl->pq_key > q->now)
+ return cl->pq_key;
+
+ htb_safe_rb_erase(p, wait_pq);
+ diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
+ htb_change_class_mode(q, cl, &diff);
+ if (cl->cmode != HTB_CAN_SEND)
+ htb_add_to_wait_tree(q, cl, diff);
+ }
+
+ /* too much load - let's continue after a break for scheduling */
+ if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
+ pr_warn("htb: too many events!\n");
+ q->warned |= HTB_WARN_TOOMANYEVENTS;
+ }
+
+ return q->now;
+}
+
+/* Returns class->node+prio from id-tree where classe's id is >= id. NULL
+ * is no such one exists.
+ */
+static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
+ u32 id)
+{
+ struct rb_node *r = NULL;
+ while (n) {
+ struct htb_class *cl =
+ rb_entry(n, struct htb_class, node[prio]);
+
+ if (id > cl->common.classid) {
+ n = n->rb_right;
+ } else if (id < cl->common.classid) {
+ r = n;
+ n = n->rb_left;
+ } else {
+ return n;
+ }
+ }
+ return r;
+}
+
+/**
+ * htb_lookup_leaf - returns next leaf class in DRR order
+ *
+ * Find leaf where current feed pointers points to.
+ */
+static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
+{
+ int i;
+ struct {
+ struct rb_node *root;
+ struct rb_node **pptr;
+ u32 *pid;
+ } stk[TC_HTB_MAXDEPTH], *sp = stk;
+
+ BUG_ON(!hprio->row.rb_node);
+ sp->root = hprio->row.rb_node;
+ sp->pptr = &hprio->ptr;
+ sp->pid = &hprio->last_ptr_id;
+
+ for (i = 0; i < 65535; i++) {
+ if (!*sp->pptr && *sp->pid) {
+ /* ptr was invalidated but id is valid - try to recover
+ * the original or next ptr
+ */
+ *sp->pptr =
+ htb_id_find_next_upper(prio, sp->root, *sp->pid);
+ }
+ *sp->pid = 0; /* ptr is valid now so that remove this hint as it
+ * can become out of date quickly
+ */
+ if (!*sp->pptr) { /* we are at right end; rewind & go up */
+ *sp->pptr = sp->root;
+ while ((*sp->pptr)->rb_left)
+ *sp->pptr = (*sp->pptr)->rb_left;
+ if (sp > stk) {
+ sp--;
+ if (!*sp->pptr) {
+ WARN_ON(1);
+ return NULL;
+ }
+ htb_next_rb_node(sp->pptr);
+ }
+ } else {
+ struct htb_class *cl;
+ struct htb_prio *clp;
+
+ cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
+ if (!cl->level)
+ return cl;
+ clp = &cl->un.inner.clprio[prio];
+ (++sp)->root = clp->feed.rb_node;
+ sp->pptr = &clp->ptr;
+ sp->pid = &clp->last_ptr_id;
+ }
+ }
+ WARN_ON(1);
+ return NULL;
+}
+
+/* dequeues packet at given priority and level; call only if
+ * you are sure that there is active class at prio/level
+ */
+static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
+ const int level)
+{
+ struct sk_buff *skb = NULL;
+ struct htb_class *cl, *start;
+ struct htb_level *hlevel = &q->hlevel[level];
+ struct htb_prio *hprio = &hlevel->hprio[prio];
+
+ /* look initial class up in the row */
+ start = cl = htb_lookup_leaf(hprio, prio);
+
+ do {
+next:
+ if (unlikely(!cl))
+ return NULL;
+
+ /* class can be empty - it is unlikely but can be true if leaf
+ * qdisc drops packets in enqueue routine or if someone used
+ * graft operation on the leaf since last dequeue;
+ * simply deactivate and skip such class
+ */
+ if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
+ struct htb_class *next;
+ htb_deactivate(q, cl);
+
+ /* row/level might become empty */
+ if ((q->row_mask[level] & (1 << prio)) == 0)
+ return NULL;
+
+ next = htb_lookup_leaf(hprio, prio);
+
+ if (cl == start) /* fix start if we just deleted it */
+ start = next;
+ cl = next;
+ goto next;
+ }
+
+ skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+ if (likely(skb != NULL))
+ break;
+
+ qdisc_warn_nonwc("htb", cl->un.leaf.q);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+ &q->hlevel[0].hprio[prio].ptr);
+ cl = htb_lookup_leaf(hprio, prio);
+
+ } while (cl != start);
+
+ if (likely(skb != NULL)) {
+ bstats_update(&cl->bstats, skb);
+ cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
+ if (cl->un.leaf.deficit[level] < 0) {
+ cl->un.leaf.deficit[level] += cl->quantum;
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+ &q->hlevel[0].hprio[prio].ptr);
+ }
+ /* this used to be after charge_class but this constelation
+ * gives us slightly better performance
+ */
+ if (!cl->un.leaf.q->q.qlen)
+ htb_deactivate(q, cl);
+ htb_charge_class(q, cl, level, skb);
+ }
+ return skb;
+}
+
+static struct sk_buff *htb_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct htb_sched *q = qdisc_priv(sch);
+ int level;
+ s64 next_event;
+ unsigned long start_at;
+
+ /* try to dequeue direct packets as high prio (!) to minimize cpu work */
+ skb = __qdisc_dequeue_head(&q->direct_queue);
+ if (skb != NULL) {
+ok:
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+
+ if (!sch->q.qlen)
+ goto fin;
+ q->now = ktime_get_ns();
+ start_at = jiffies;
+
+ next_event = q->now + 5LLU * NSEC_PER_SEC;
+
+ for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
+ /* common case optimization - skip event handler quickly */
+ int m;
+ s64 event = q->near_ev_cache[level];
+
+ if (q->now >= event) {
+ event = htb_do_events(q, level, start_at);
+ if (!event)
+ event = q->now + NSEC_PER_SEC;
+ q->near_ev_cache[level] = event;
+ }
+
+ if (next_event > event)
+ next_event = event;
+
+ m = ~q->row_mask[level];
+ while (m != (int)(-1)) {
+ int prio = ffz(m);
+
+ m |= 1 << prio;
+ skb = htb_dequeue_tree(q, prio, level);
+ if (likely(skb != NULL))
+ goto ok;
+ }
+ }
+ qdisc_qstats_overlimit(sch);
+ if (likely(next_event > q->now))
+ qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
+ else
+ schedule_work(&q->work);
+fin:
+ return skb;
+}
+
+/* reset all classes */
+/* always caled under BH & queue lock */
+static void htb_reset(struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (cl->level)
+ memset(&cl->un.inner, 0, sizeof(cl->un.inner));
+ else {
+ if (cl->un.leaf.q)
+ qdisc_reset(cl->un.leaf.q);
+ }
+ cl->prio_activity = 0;
+ cl->cmode = HTB_CAN_SEND;
+ }
+ }
+ qdisc_watchdog_cancel(&q->watchdog);
+ __qdisc_reset_queue(&q->direct_queue);
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+ memset(q->hlevel, 0, sizeof(q->hlevel));
+ memset(q->row_mask, 0, sizeof(q->row_mask));
+}
+
+static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
+ [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
+ [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) },
+ [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
+ [TCA_HTB_RATE64] = { .type = NLA_U64 },
+ [TCA_HTB_CEIL64] = { .type = NLA_U64 },
+};
+
+static void htb_work_func(struct work_struct *work)
+{
+ struct htb_sched *q = container_of(work, struct htb_sched, work);
+ struct Qdisc *sch = q->watchdog.qdisc;
+
+ rcu_read_lock();
+ __netif_schedule(qdisc_root(sch));
+ rcu_read_unlock();
+}
+
+static int htb_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct tc_htb_glob *gopt;
+ int err;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+ INIT_WORK(&q->work, htb_work_func);
+
+ if (!opt)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_HTB_INIT])
+ return -EINVAL;
+
+ gopt = nla_data(tb[TCA_HTB_INIT]);
+ if (gopt->version != HTB_VER >> 16)
+ return -EINVAL;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+
+ qdisc_skb_head_init(&q->direct_queue);
+
+ if (tb[TCA_HTB_DIRECT_QLEN])
+ q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
+ else
+ q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
+
+ if ((q->rate2quantum = gopt->rate2quantum) < 1)
+ q->rate2quantum = 1;
+ q->defcls = gopt->defcls;
+
+ return 0;
+}
+
+static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct nlattr *nest;
+ struct tc_htb_glob gopt;
+
+ /* Its safe to not acquire qdisc lock. As we hold RTNL,
+ * no change can happen on the qdisc parameters.
+ */
+
+ gopt.direct_pkts = q->direct_pkts;
+ gopt.version = HTB_VER;
+ gopt.rate2quantum = q->rate2quantum;
+ gopt.defcls = q->defcls;
+ gopt.debug = 0;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
+ nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+ struct nlattr *nest;
+ struct tc_htb_opt opt;
+
+ /* Its safe to not acquire qdisc lock. As we hold RTNL,
+ * no change can happen on the class parameters.
+ */
+ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ if (!cl->level && cl->un.leaf.q)
+ tcm->tcm_info = cl->un.leaf.q->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ memset(&opt, 0, sizeof(opt));
+
+ psched_ratecfg_getrate(&opt.rate, &cl->rate);
+ opt.buffer = PSCHED_NS2TICKS(cl->buffer);
+ psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
+ opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
+ opt.quantum = cl->quantum;
+ opt.prio = cl->prio;
+ opt.level = cl->level;
+ if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
+ TCA_HTB_PAD))
+ goto nla_put_failure;
+ if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
+ TCA_HTB_PAD))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int
+htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+ struct gnet_stats_queue qs = {
+ .drops = cl->drops,
+ .overlimits = cl->overlimits,
+ };
+ __u32 qlen = 0;
+
+ if (!cl->level && cl->un.leaf.q) {
+ qlen = cl->un.leaf.q->q.qlen;
+ qs.backlog = cl->un.leaf.q->qstats.backlog;
+ }
+ cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
+ INT_MIN, INT_MAX);
+ cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
+ INT_MIN, INT_MAX);
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
+}
+
+static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ if (cl->level)
+ return -EINVAL;
+ if (new == NULL &&
+ (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->common.classid, extack)) == NULL)
+ return -ENOBUFS;
+
+ *old = qdisc_replace(sch, new, &cl->un.leaf.q);
+ return 0;
+}
+
+static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+ return !cl->level ? cl->un.leaf.q : NULL;
+}
+
+static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ htb_deactivate(qdisc_priv(sch), cl);
+}
+
+static inline int htb_parent_last_child(struct htb_class *cl)
+{
+ if (!cl->parent)
+ /* the root class */
+ return 0;
+ if (cl->parent->children > 1)
+ /* not the last child */
+ return 0;
+ return 1;
+}
+
+static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
+ struct Qdisc *new_q)
+{
+ struct htb_class *parent = cl->parent;
+
+ WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
+
+ if (parent->cmode != HTB_CAN_SEND)
+ htb_safe_rb_erase(&parent->pq_node,
+ &q->hlevel[parent->level].wait_pq);
+
+ parent->level = 0;
+ memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+ parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
+ parent->tokens = parent->buffer;
+ parent->ctokens = parent->cbuffer;
+ parent->t_c = ktime_get_ns();
+ parent->cmode = HTB_CAN_SEND;
+}
+
+static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
+{
+ if (!cl->level) {
+ WARN_ON(!cl->un.leaf.q);
+ qdisc_put(cl->un.leaf.q);
+ }
+ gen_kill_estimator(&cl->rate_est);
+ tcf_block_put(cl->block);
+ kfree(cl);
+}
+
+static void htb_destroy(struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct hlist_node *next;
+ struct htb_class *cl;
+ unsigned int i;
+
+ cancel_work_sync(&q->work);
+ qdisc_watchdog_cancel(&q->watchdog);
+ /* This line used to be after htb_destroy_class call below
+ * and surprisingly it worked in 2.4. But it must precede it
+ * because filter need its target class alive to be able to call
+ * unbind_filter on it (without Oops).
+ */
+ tcf_block_put(q->block);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ tcf_block_put(cl->block);
+ cl->block = NULL;
+ }
+ }
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode)
+ htb_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+ __qdisc_reset_queue(&q->direct_queue);
+}
+
+static int htb_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = (struct htb_class *)arg;
+ struct Qdisc *new_q = NULL;
+ int last_child = 0;
+
+ /* TODO: why don't allow to delete subtree ? references ? does
+ * tc subsys guarantee us that in htb_destroy it holds no class
+ * refs so that we can remove children safely there ?
+ */
+ if (cl->children || cl->filter_cnt)
+ return -EBUSY;
+
+ if (!cl->level && htb_parent_last_child(cl)) {
+ new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->parent->common.classid,
+ NULL);
+ last_child = 1;
+ }
+
+ sch_tree_lock(sch);
+
+ if (!cl->level) {
+ unsigned int qlen = cl->un.leaf.q->q.qlen;
+ unsigned int backlog = cl->un.leaf.q->qstats.backlog;
+
+ qdisc_reset(cl->un.leaf.q);
+ qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
+ }
+
+ /* delete from hash and active; remainder in destroy_class */
+ qdisc_class_hash_remove(&q->clhash, &cl->common);
+ if (cl->parent)
+ cl->parent->children--;
+
+ if (cl->prio_activity)
+ htb_deactivate(q, cl);
+
+ if (cl->cmode != HTB_CAN_SEND)
+ htb_safe_rb_erase(&cl->pq_node,
+ &q->hlevel[cl->level].wait_pq);
+
+ if (last_child)
+ htb_parent_to_leaf(q, cl, new_q);
+
+ sch_tree_unlock(sch);
+
+ htb_destroy_class(sch, cl);
+ return 0;
+}
+
+static int htb_change_class(struct Qdisc *sch, u32 classid,
+ u32 parentid, struct nlattr **tca,
+ unsigned long *arg, struct netlink_ext_ack *extack)
+{
+ int err = -EINVAL;
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = (struct htb_class *)*arg, *parent;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct tc_htb_opt *hopt;
+ u64 rate64, ceil64;
+ int warn = 0;
+
+ /* extract all subattrs from opt attr */
+ if (!opt)
+ goto failure;
+
+ err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
+ if (err < 0)
+ goto failure;
+
+ err = -EINVAL;
+ if (tb[TCA_HTB_PARMS] == NULL)
+ goto failure;
+
+ parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
+
+ hopt = nla_data(tb[TCA_HTB_PARMS]);
+ if (!hopt->rate.rate || !hopt->ceil.rate)
+ goto failure;
+
+ /* Keeping backward compatible with rate_table based iproute2 tc */
+ if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
+ NULL));
+
+ if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
+ NULL));
+
+ if (!cl) { /* new class */
+ struct Qdisc *new_q;
+ int prio;
+ struct {
+ struct nlattr nla;
+ struct gnet_estimator opt;
+ } est = {
+ .nla = {
+ .nla_len = nla_attr_size(sizeof(est.opt)),
+ .nla_type = TCA_RATE,
+ },
+ .opt = {
+ /* 4s interval, 16s averaging constant */
+ .interval = 2,
+ .ewma_log = 2,
+ },
+ };
+
+ /* check for valid classid */
+ if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
+ htb_find(classid, sch))
+ goto failure;
+
+ /* check maximal depth */
+ if (parent && parent->parent && parent->parent->level < 2) {
+ pr_err("htb: tree is too deep\n");
+ goto failure;
+ }
+ err = -ENOBUFS;
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (!cl)
+ goto failure;
+
+ err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
+ if (err) {
+ kfree(cl);
+ goto failure;
+ }
+ if (htb_rate_est || tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE] ? : &est.nla);
+ if (err) {
+ tcf_block_put(cl->block);
+ kfree(cl);
+ goto failure;
+ }
+ }
+
+ cl->children = 0;
+ RB_CLEAR_NODE(&cl->pq_node);
+
+ for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
+ RB_CLEAR_NODE(&cl->node[prio]);
+
+ /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
+ * so that can't be used inside of sch_tree_lock
+ * -- thanks to Karlis Peisenieks
+ */
+ new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ classid, NULL);
+ sch_tree_lock(sch);
+ if (parent && !parent->level) {
+ unsigned int qlen = parent->un.leaf.q->q.qlen;
+ unsigned int backlog = parent->un.leaf.q->qstats.backlog;
+
+ /* turn parent into inner node */
+ qdisc_reset(parent->un.leaf.q);
+ qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
+ qdisc_put(parent->un.leaf.q);
+ if (parent->prio_activity)
+ htb_deactivate(q, parent);
+
+ /* remove from evt list because of level change */
+ if (parent->cmode != HTB_CAN_SEND) {
+ htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
+ parent->cmode = HTB_CAN_SEND;
+ }
+ parent->level = (parent->parent ? parent->parent->level
+ : TC_HTB_MAXDEPTH) - 1;
+ memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+ }
+ /* leaf (we) needs elementary qdisc */
+ cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
+
+ cl->common.classid = classid;
+ cl->parent = parent;
+
+ /* set class to be in HTB_CAN_SEND state */
+ cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
+ cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
+ cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */
+ cl->t_c = ktime_get_ns();
+ cl->cmode = HTB_CAN_SEND;
+
+ /* attach to the hash list and parent's family */
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
+ if (parent)
+ parent->children++;
+ if (cl->un.leaf.q != &noop_qdisc)
+ qdisc_hash_add(cl->un.leaf.q, true);
+ } else {
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+ sch_tree_lock(sch);
+ }
+
+ rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+
+ ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
+ psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
+ psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
+
+ /* it used to be a nasty bug here, we have to check that node
+ * is really leaf before changing cl->un.leaf !
+ */
+ if (!cl->level) {
+ u64 quantum = cl->rate.rate_bytes_ps;
+
+ do_div(quantum, q->rate2quantum);
+ cl->quantum = min_t(u64, quantum, INT_MAX);
+
+ if (!hopt->quantum && cl->quantum < 1000) {
+ warn = -1;
+ cl->quantum = 1000;
+ }
+ if (!hopt->quantum && cl->quantum > 200000) {
+ warn = 1;
+ cl->quantum = 200000;
+ }
+ if (hopt->quantum)
+ cl->quantum = hopt->quantum;
+ if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
+ cl->prio = TC_HTB_NUMPRIO - 1;
+ }
+
+ cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
+ cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
+
+ sch_tree_unlock(sch);
+
+ if (warn)
+ pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
+ cl->common.classid, (warn == -1 ? "small" : "big"));
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+
+failure:
+ return err;
+}
+
+static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
+ struct netlink_ext_ack *extack)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ return cl ? cl->block : q->block;
+}
+
+static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct htb_class *cl = htb_find(classid, sch);
+
+ /*if (cl && !cl->level) return 0;
+ * The line above used to be there to prevent attaching filters to
+ * leaves. But at least tc_index filter uses this just to get class
+ * for other reasons so that we have to allow for it.
+ * ----
+ * 19.6.2002 As Werner explained it is ok - bind filter is just
+ * another way to "lock" the class - unlike "get" this lock can
+ * be broken by class during destroy IIUC.
+ */
+ if (cl)
+ cl->filter_cnt++;
+ return (unsigned long)cl;
+}
+
+static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ if (cl)
+ cl->filter_cnt--;
+}
+
+static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static const struct Qdisc_class_ops htb_class_ops = {
+ .graft = htb_graft,
+ .leaf = htb_leaf,
+ .qlen_notify = htb_qlen_notify,
+ .find = htb_search,
+ .change = htb_change_class,
+ .delete = htb_delete,
+ .walk = htb_walk,
+ .tcf_block = htb_tcf_block,
+ .bind_tcf = htb_bind_filter,
+ .unbind_tcf = htb_unbind_filter,
+ .dump = htb_dump_class,
+ .dump_stats = htb_dump_class_stats,
+};
+
+static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
+ .cl_ops = &htb_class_ops,
+ .id = "htb",
+ .priv_size = sizeof(struct htb_sched),
+ .enqueue = htb_enqueue,
+ .dequeue = htb_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = htb_init,
+ .reset = htb_reset,
+ .destroy = htb_destroy,
+ .dump = htb_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init htb_module_init(void)
+{
+ return register_qdisc(&htb_qdisc_ops);
+}
+static void __exit htb_module_exit(void)
+{
+ unregister_qdisc(&htb_qdisc_ops);
+}
+
+module_init(htb_module_init)
+module_exit(htb_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
new file mode 100644
index 000000000..ce3f55259
--- /dev/null
+++ b/net/sched/sch_ingress.c
@@ -0,0 +1,300 @@
+/* net/sched/sch_ingress.c - Ingress and clsact qdisc
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim 1999
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+struct ingress_sched_data {
+ struct tcf_block *block;
+ struct tcf_block_ext_info block_info;
+ struct mini_Qdisc_pair miniqp;
+};
+
+static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long ingress_find(struct Qdisc *sch, u32 classid)
+{
+ return TC_H_MIN(classid) + 1;
+}
+
+static unsigned long ingress_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ return ingress_find(sch, classid);
+}
+
+static void ingress_unbind_filter(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+}
+
+static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+
+ return q->block;
+}
+
+static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv)
+{
+ struct mini_Qdisc_pair *miniqp = priv;
+
+ mini_qdisc_pair_swap(miniqp, tp_head);
+};
+
+static void ingress_ingress_block_set(struct Qdisc *sch, u32 block_index)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+
+ q->block_info.block_index = block_index;
+}
+
+static u32 ingress_ingress_block_get(struct Qdisc *sch)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+
+ return q->block_info.block_index;
+}
+
+static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ net_inc_ingress_queue();
+
+ mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
+
+ q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->block_info.chain_head_change = clsact_chain_head_change;
+ q->block_info.chain_head_change_priv = &q->miniqp;
+
+ return tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
+}
+
+static void ingress_destroy(struct Qdisc *sch)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put_ext(q->block, sch, &q->block_info);
+ net_dec_ingress_queue();
+}
+
+static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static const struct Qdisc_class_ops ingress_class_ops = {
+ .leaf = ingress_leaf,
+ .find = ingress_find,
+ .walk = ingress_walk,
+ .tcf_block = ingress_tcf_block,
+ .bind_tcf = ingress_bind_filter,
+ .unbind_tcf = ingress_unbind_filter,
+};
+
+static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
+ .cl_ops = &ingress_class_ops,
+ .id = "ingress",
+ .priv_size = sizeof(struct ingress_sched_data),
+ .static_flags = TCQ_F_CPUSTATS,
+ .init = ingress_init,
+ .destroy = ingress_destroy,
+ .dump = ingress_dump,
+ .ingress_block_set = ingress_ingress_block_set,
+ .ingress_block_get = ingress_ingress_block_get,
+ .owner = THIS_MODULE,
+};
+
+struct clsact_sched_data {
+ struct tcf_block *ingress_block;
+ struct tcf_block *egress_block;
+ struct tcf_block_ext_info ingress_block_info;
+ struct tcf_block_ext_info egress_block_info;
+ struct mini_Qdisc_pair miniqp_ingress;
+ struct mini_Qdisc_pair miniqp_egress;
+};
+
+static unsigned long clsact_find(struct Qdisc *sch, u32 classid)
+{
+ switch (TC_H_MIN(classid)) {
+ case TC_H_MIN(TC_H_MIN_INGRESS):
+ case TC_H_MIN(TC_H_MIN_EGRESS):
+ return TC_H_MIN(classid);
+ default:
+ return 0;
+ }
+}
+
+static unsigned long clsact_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ return clsact_find(sch, classid);
+}
+
+static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct clsact_sched_data *q = qdisc_priv(sch);
+
+ switch (cl) {
+ case TC_H_MIN(TC_H_MIN_INGRESS):
+ return q->ingress_block;
+ case TC_H_MIN(TC_H_MIN_EGRESS):
+ return q->egress_block;
+ default:
+ return NULL;
+ }
+}
+
+static void clsact_ingress_block_set(struct Qdisc *sch, u32 block_index)
+{
+ struct clsact_sched_data *q = qdisc_priv(sch);
+
+ q->ingress_block_info.block_index = block_index;
+}
+
+static void clsact_egress_block_set(struct Qdisc *sch, u32 block_index)
+{
+ struct clsact_sched_data *q = qdisc_priv(sch);
+
+ q->egress_block_info.block_index = block_index;
+}
+
+static u32 clsact_ingress_block_get(struct Qdisc *sch)
+{
+ struct clsact_sched_data *q = qdisc_priv(sch);
+
+ return q->ingress_block_info.block_index;
+}
+
+static u32 clsact_egress_block_get(struct Qdisc *sch)
+{
+ struct clsact_sched_data *q = qdisc_priv(sch);
+
+ return q->egress_block_info.block_index;
+}
+
+static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct clsact_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int err;
+
+ net_inc_ingress_queue();
+ net_inc_egress_queue();
+
+ mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
+
+ q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->ingress_block_info.chain_head_change = clsact_chain_head_change;
+ q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
+
+ err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info,
+ extack);
+ if (err)
+ return err;
+
+ mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
+
+ q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
+ q->egress_block_info.chain_head_change = clsact_chain_head_change;
+ q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
+
+ return tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info, extack);
+}
+
+static void clsact_destroy(struct Qdisc *sch)
+{
+ struct clsact_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
+ tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
+
+ net_dec_ingress_queue();
+ net_dec_egress_queue();
+}
+
+static const struct Qdisc_class_ops clsact_class_ops = {
+ .leaf = ingress_leaf,
+ .find = clsact_find,
+ .walk = ingress_walk,
+ .tcf_block = clsact_tcf_block,
+ .bind_tcf = clsact_bind_filter,
+ .unbind_tcf = ingress_unbind_filter,
+};
+
+static struct Qdisc_ops clsact_qdisc_ops __read_mostly = {
+ .cl_ops = &clsact_class_ops,
+ .id = "clsact",
+ .priv_size = sizeof(struct clsact_sched_data),
+ .static_flags = TCQ_F_CPUSTATS,
+ .init = clsact_init,
+ .destroy = clsact_destroy,
+ .dump = ingress_dump,
+ .ingress_block_set = clsact_ingress_block_set,
+ .egress_block_set = clsact_egress_block_set,
+ .ingress_block_get = clsact_ingress_block_get,
+ .egress_block_get = clsact_egress_block_get,
+ .owner = THIS_MODULE,
+};
+
+static int __init ingress_module_init(void)
+{
+ int ret;
+
+ ret = register_qdisc(&ingress_qdisc_ops);
+ if (!ret) {
+ ret = register_qdisc(&clsact_qdisc_ops);
+ if (ret)
+ unregister_qdisc(&ingress_qdisc_ops);
+ }
+
+ return ret;
+}
+
+static void __exit ingress_module_exit(void)
+{
+ unregister_qdisc(&ingress_qdisc_ops);
+ unregister_qdisc(&clsact_qdisc_ops);
+}
+
+module_init(ingress_module_init);
+module_exit(ingress_module_exit);
+
+MODULE_ALIAS("sch_clsact");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
new file mode 100644
index 000000000..0ab13a495
--- /dev/null
+++ b/net/sched/sch_mq.c
@@ -0,0 +1,314 @@
+/*
+ * net/sched/sch_mq.c Classful multiqueue dummy scheduler
+ *
+ * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mq_sched {
+ struct Qdisc **qdiscs;
+};
+
+static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mq_qopt_offload opt = {
+ .command = cmd,
+ .handle = sch->handle,
+ };
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+}
+
+static void mq_offload_stats(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mq_qopt_offload opt = {
+ .command = TC_MQ_STATS,
+ .handle = sch->handle,
+ .stats = {
+ .bstats = &sch->bstats,
+ .qstats = &sch->qstats,
+ },
+ };
+
+ if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+}
+
+static void mq_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mq_sched *priv = qdisc_priv(sch);
+ unsigned int ntx;
+
+ mq_offload(sch, TC_MQ_DESTROY);
+
+ if (!priv->qdiscs)
+ return;
+ for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
+ qdisc_put(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+}
+
+static int mq_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mq_sched *priv = qdisc_priv(sch);
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+ unsigned int ntx;
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ /* pre-allocate qdiscs, attachment can't fail */
+ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+ GFP_KERNEL);
+ if (!priv->qdiscs)
+ return -ENOMEM;
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ dev_queue = netdev_get_tx_queue(dev, ntx);
+ qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(ntx + 1)),
+ extack);
+ if (!qdisc)
+ return -ENOMEM;
+ priv->qdiscs[ntx] = qdisc;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ }
+
+ sch->flags |= TCQ_F_MQROOT;
+
+ mq_offload(sch, TC_MQ_CREATE);
+ return 0;
+}
+
+static void mq_attach(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mq_sched *priv = qdisc_priv(sch);
+ struct Qdisc *qdisc, *old;
+ unsigned int ntx;
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = priv->qdiscs[ntx];
+ old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (old)
+ qdisc_put(old);
+#ifdef CONFIG_NET_SCHED
+ if (ntx < dev->real_num_tx_queues)
+ qdisc_hash_add(qdisc, false);
+#endif
+
+ }
+ kfree(priv->qdiscs);
+ priv->qdiscs = NULL;
+}
+
+static void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx)
+{
+#ifdef CONFIG_NET_SCHED
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ /* Only update the default qdiscs we created,
+ * qdiscs with handles are always hashed.
+ */
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_del(qdisc);
+ }
+ for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_add(qdisc, false);
+ }
+#endif
+}
+
+static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *qdisc;
+ unsigned int ntx;
+ __u32 qlen = 0;
+
+ sch->q.qlen = 0;
+ memset(&sch->bstats, 0, sizeof(sch->bstats));
+ memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+ /* MQ supports lockless qdiscs. However, statistics accounting needs
+ * to account for all, none, or a mix of locked and unlocked child
+ * qdiscs. Percpu stats are added to counters in-band and locking
+ * qdisc totals are added at end.
+ */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+ spin_lock_bh(qdisc_lock(qdisc));
+
+ if (qdisc_is_percpu_stats(qdisc)) {
+ qlen = qdisc_qlen_sum(qdisc);
+ __gnet_stats_copy_basic(NULL, &sch->bstats,
+ qdisc->cpu_bstats,
+ &qdisc->bstats);
+ __gnet_stats_copy_queue(&sch->qstats,
+ qdisc->cpu_qstats,
+ &qdisc->qstats, qlen);
+ sch->q.qlen += qlen;
+ } else {
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.qlen += qdisc->qstats.qlen;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ }
+
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+ mq_offload_stats(sch);
+
+ return 0;
+}
+
+static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1;
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
+ struct tcmsg *tcm)
+{
+ return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
+}
+
+static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = dev_graft_qdisc(dev_queue, new);
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+ return 0;
+}
+
+static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mq_find(struct Qdisc *sch, u32 classid)
+{
+ unsigned int ntx = TC_H_MIN(classid);
+
+ if (!mq_queue_get(sch, ntx))
+ return 0;
+ return ntx;
+}
+
+static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ return 0;
+}
+
+static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats,
+ &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+ return -1;
+ return 0;
+}
+
+static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx;
+
+ if (arg->stop)
+ return;
+
+ arg->count = arg->skip;
+ for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops mq_class_ops = {
+ .select_queue = mq_select_queue,
+ .graft = mq_graft,
+ .leaf = mq_leaf,
+ .find = mq_find,
+ .walk = mq_walk,
+ .dump = mq_dump_class,
+ .dump_stats = mq_dump_class_stats,
+};
+
+struct Qdisc_ops mq_qdisc_ops __read_mostly = {
+ .cl_ops = &mq_class_ops,
+ .id = "mq",
+ .priv_size = sizeof(struct mq_sched),
+ .init = mq_init,
+ .destroy = mq_destroy,
+ .attach = mq_attach,
+ .change_real_num_tx = mq_change_real_num_tx,
+ .dump = mq_dump,
+ .owner = THIS_MODULE,
+};
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 000000000..64d7f876d
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,675 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+
+struct mqprio_sched {
+ struct Qdisc **qdiscs;
+ u16 mode;
+ u16 shaper;
+ int hw_offload;
+ u32 flags;
+ u64 min_rate[TC_QOPT_MAX_QUEUE];
+ u64 max_rate[TC_QOPT_MAX_QUEUE];
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned int ntx;
+
+ if (priv->qdiscs) {
+ for (ntx = 0;
+ ntx < dev->num_tx_queues && priv->qdiscs[ntx];
+ ntx++)
+ qdisc_put(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+ }
+
+ if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
+ struct tc_mqprio_qopt_offload mqprio = { { 0 } };
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ case TC_MQPRIO_MODE_CHANNEL:
+ dev->netdev_ops->ndo_setup_tc(dev,
+ TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ break;
+ default:
+ return;
+ }
+ } else {
+ netdev_set_num_tc(dev, 0);
+ }
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int i, j;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i < TC_BITMASK + 1; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc)
+ return -EINVAL;
+ }
+
+ /* Limit qopt->hw to maximum supported offload value. Drivers have
+ * the option of overriding this later if they don't support the a
+ * given offload type.
+ */
+ if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX)
+ qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX;
+
+ /* If hardware offload is requested we will leave it to the device
+ * to either populate the queue counts itself or to validate the
+ * provided queue counts. If ndo_setup_tc is not present then
+ * hardware doesn't support offload and we should return an error.
+ */
+ if (qopt->hw)
+ return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ /* Verify the queue count is in tx range being equal to the
+ * real_num_tx_queues indicates the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ !qopt->count[i] ||
+ last > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (last > qopt->offset[j])
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
+ [TCA_MQPRIO_MODE] = { .len = sizeof(u16) },
+ [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) },
+ [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED },
+ [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED },
+};
+
+static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy, int len)
+{
+ int nested_len = nla_len(nla) - NLA_ALIGN(len);
+
+ if (nested_len >= nla_attr_size(0))
+ return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
+ nested_len, policy, NULL);
+
+ memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+ return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+ int i, err = -EOPNOTSUPP;
+ struct tc_mqprio_qopt *qopt = NULL;
+ struct nlattr *tb[TCA_MQPRIO_MAX + 1];
+ struct nlattr *attr;
+ int rem;
+ int len;
+
+ BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+ BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ /* make certain can allocate enough classids to handle queues */
+ if (dev->num_tx_queues >= TC_H_MIN_PRIORITY)
+ return -ENOMEM;
+
+ if (!opt || nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+ if (mqprio_parse_opt(dev, qopt))
+ return -EINVAL;
+
+ len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
+ if (len > 0) {
+ err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
+ sizeof(*qopt));
+ if (err < 0)
+ return err;
+
+ if (!qopt->hw)
+ return -EINVAL;
+
+ if (tb[TCA_MQPRIO_MODE]) {
+ priv->flags |= TC_MQPRIO_F_MODE;
+ priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
+ }
+
+ if (tb[TCA_MQPRIO_SHAPER]) {
+ priv->flags |= TC_MQPRIO_F_SHAPER;
+ priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
+ }
+
+ if (tb[TCA_MQPRIO_MIN_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ return -EINVAL;
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
+ return -EINVAL;
+ if (i >= qopt->num_tc)
+ break;
+ priv->min_rate[i] = *(u64 *)nla_data(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MIN_RATE;
+ }
+
+ if (tb[TCA_MQPRIO_MAX_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ return -EINVAL;
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
+ return -EINVAL;
+ if (i >= qopt->num_tc)
+ break;
+ priv->max_rate[i] = *(u64 *)nla_data(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MAX_RATE;
+ }
+ }
+
+ /* pre-allocate qdisc, attachment can't fail */
+ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+ GFP_KERNEL);
+ if (!priv->qdiscs)
+ return -ENOMEM;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue,
+ get_default_qdisc_ops(dev, i),
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)), extack);
+ if (!qdisc)
+ return -ENOMEM;
+
+ priv->qdiscs[i] = qdisc;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ }
+
+ /* If the mqprio options indicate that hardware should own
+ * the queue mapping then run ndo_setup_tc otherwise use the
+ * supplied and verified mapping
+ */
+ if (qopt->hw) {
+ struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
+ return -EINVAL;
+ break;
+ case TC_MQPRIO_MODE_CHANNEL:
+ mqprio.flags = priv->flags;
+ if (priv->flags & TC_MQPRIO_F_MODE)
+ mqprio.mode = priv->mode;
+ if (priv->flags & TC_MQPRIO_F_SHAPER)
+ mqprio.shaper = priv->shaper;
+ if (priv->flags & TC_MQPRIO_F_MIN_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.min_rate[i] = priv->min_rate[i];
+ if (priv->flags & TC_MQPRIO_F_MAX_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.max_rate[i] = priv->max_rate[i];
+ break;
+ default:
+ return -EINVAL;
+ }
+ err = dev->netdev_ops->ndo_setup_tc(dev,
+ TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ if (err)
+ return err;
+
+ priv->hw_offload = mqprio.qopt.hw;
+ } else {
+ netdev_set_num_tc(dev, qopt->num_tc);
+ for (i = 0; i < qopt->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ qopt->count[i], qopt->offset[i]);
+ }
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i < TC_BITMASK + 1; i++)
+ netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+ sch->flags |= TCQ_F_MQROOT;
+ return 0;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct Qdisc *qdisc, *old;
+ unsigned int ntx;
+
+ /* Attach underlying qdisc */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = priv->qdiscs[ntx];
+ old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (old)
+ qdisc_put(old);
+ if (ntx < dev->real_num_tx_queues)
+ qdisc_hash_add(qdisc, false);
+ }
+ kfree(priv->qdiscs);
+ priv->qdiscs = NULL;
+}
+
+static void mqprio_change_real_num_tx(struct Qdisc *sch,
+ unsigned int new_real_tx)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ /* Only update the default qdiscs we created,
+ * qdiscs with handles are always hashed.
+ */
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_del(qdisc);
+ }
+ for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_add(qdisc, false);
+ }
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1;
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return -EINVAL;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = dev_graft_qdisc(dev_queue, new);
+
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return 0;
+}
+
+static int dump_rates(struct mqprio_sched *priv,
+ struct tc_mqprio_qopt *opt, struct sk_buff *skb)
+{
+ struct nlattr *nest;
+ int i;
+
+ if (priv->flags & TC_MQPRIO_F_MIN_RATE) {
+ nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64);
+ if (!nest)
+ goto nla_put_failure;
+
+ for (i = 0; i < opt->num_tc; i++) {
+ if (nla_put(skb, TCA_MQPRIO_MIN_RATE64,
+ sizeof(priv->min_rate[i]),
+ &priv->min_rate[i]))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+
+ if (priv->flags & TC_MQPRIO_F_MAX_RATE) {
+ nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64);
+ if (!nest)
+ goto nla_put_failure;
+
+ for (i = 0; i < opt->num_tc; i++) {
+ if (nla_put(skb, TCA_MQPRIO_MAX_RATE64,
+ sizeof(priv->max_rate[i]),
+ &priv->max_rate[i]))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
+ struct tc_mqprio_qopt opt = { 0 };
+ struct Qdisc *qdisc;
+ unsigned int ntx, tc;
+
+ sch->q.qlen = 0;
+ memset(&sch->bstats, 0, sizeof(sch->bstats));
+ memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+ /* MQ supports lockless qdiscs. However, statistics accounting needs
+ * to account for all, none, or a mix of locked and unlocked child
+ * qdiscs. Percpu stats are added to counters in-band and locking
+ * qdisc totals are added at end.
+ */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+ spin_lock_bh(qdisc_lock(qdisc));
+
+ if (qdisc_is_percpu_stats(qdisc)) {
+ __u32 qlen = qdisc_qlen_sum(qdisc);
+
+ __gnet_stats_copy_basic(NULL, &sch->bstats,
+ qdisc->cpu_bstats,
+ &qdisc->bstats);
+ __gnet_stats_copy_queue(&sch->qstats,
+ qdisc->cpu_qstats,
+ &qdisc->qstats, qlen);
+ sch->q.qlen += qlen;
+ } else {
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ }
+
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+
+ opt.num_tc = netdev_get_num_tc(dev);
+ memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ opt.hw = priv->hw_offload;
+
+ for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
+ opt.count[tc] = dev->tc_to_txq[tc].count;
+ opt.offset[tc] = dev->tc_to_txq[tc].offset;
+ }
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if ((priv->flags & TC_MQPRIO_F_MODE) &&
+ nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode))
+ goto nla_put_failure;
+
+ if ((priv->flags & TC_MQPRIO_F_SHAPER) &&
+ nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper))
+ goto nla_put_failure;
+
+ if ((priv->flags & TC_MQPRIO_F_MIN_RATE ||
+ priv->flags & TC_MQPRIO_F_MAX_RATE) &&
+ (dump_rates(priv, &opt, skb) != 0))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nla);
+nla_put_failure:
+ nlmsg_trim(skb, nla);
+ return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return NULL;
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mqprio_find(struct Qdisc *sch, u32 classid)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx = TC_H_MIN(classid);
+
+ /* There are essentially two regions here that have valid classid
+ * values. The first region will have a classid value of 1 through
+ * num_tx_queues. All of these are backed by actual Qdiscs.
+ */
+ if (ntx < TC_H_MIN_PRIORITY)
+ return (ntx <= dev->num_tx_queues) ? ntx : 0;
+
+ /* The second region represents the hardware traffic classes. These
+ * are represented by classid values of TC_H_MIN_PRIORITY through
+ * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1
+ */
+ return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0;
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ if (cl < TC_H_MIN_PRIORITY) {
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+ struct net_device *dev = qdisc_dev(sch);
+ int tc = netdev_txq_to_tc(dev, cl - 1);
+
+ tcm->tcm_parent = (tc < 0) ? 0 :
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(tc + TC_H_MIN_PRIORITY));
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ } else {
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_info = 0;
+ }
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+ __releases(d->lock)
+ __acquires(d->lock)
+{
+ if (cl >= TC_H_MIN_PRIORITY) {
+ int i;
+ __u32 qlen = 0;
+ struct gnet_stats_queue qstats = {0};
+ struct gnet_stats_basic_packed bstats = {0};
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK];
+
+ /* Drop lock here it will be reclaimed before touching
+ * statistics this is required because the d->lock we
+ * hold here is the look on dev_queue->qdisc_sleeping
+ * also acquired below.
+ */
+ if (d->lock)
+ spin_unlock_bh(d->lock);
+
+ for (i = tc.offset; i < tc.offset + tc.count; i++) {
+ struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+ struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
+
+ spin_lock_bh(qdisc_lock(qdisc));
+
+ if (qdisc_is_percpu_stats(qdisc)) {
+ qlen = qdisc_qlen_sum(qdisc);
+
+ __gnet_stats_copy_basic(NULL, &bstats,
+ qdisc->cpu_bstats,
+ &qdisc->bstats);
+ __gnet_stats_copy_queue(&qstats,
+ qdisc->cpu_qstats,
+ &qdisc->qstats,
+ qlen);
+ } else {
+ qlen += qdisc->q.qlen;
+ bstats.bytes += qdisc->bstats.bytes;
+ bstats.packets += qdisc->bstats.packets;
+ qstats.backlog += qdisc->qstats.backlog;
+ qstats.drops += qdisc->qstats.drops;
+ qstats.requeues += qdisc->qstats.requeues;
+ qstats.overlimits += qdisc->qstats.overlimits;
+ }
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+
+ /* Reclaim root sleeping lock before completing stats */
+ if (d->lock)
+ spin_lock_bh(d->lock);
+ if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
+ return -1;
+ } else {
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d,
+ sch->cpu_bstats, &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL,
+ &sch->qstats, sch->q.qlen) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx;
+
+ if (arg->stop)
+ return;
+
+ /* Walk hierarchy with a virtual class per tc */
+ arg->count = arg->skip;
+ for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) {
+ if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+
+ /* Pad the values and skip over unused traffic classes */
+ if (ntx < TC_MAX_QUEUE) {
+ arg->count = TC_MAX_QUEUE;
+ ntx = TC_MAX_QUEUE;
+ }
+
+ /* Reset offset, sort out remaining per-queue qdiscs */
+ for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+}
+
+static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch,
+ struct tcmsg *tcm)
+{
+ return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {
+ .graft = mqprio_graft,
+ .leaf = mqprio_leaf,
+ .find = mqprio_find,
+ .walk = mqprio_walk,
+ .dump = mqprio_dump_class,
+ .dump_stats = mqprio_dump_class_stats,
+ .select_queue = mqprio_select_queue,
+};
+
+static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+ .cl_ops = &mqprio_class_ops,
+ .id = "mqprio",
+ .priv_size = sizeof(struct mqprio_sched),
+ .init = mqprio_init,
+ .destroy = mqprio_destroy,
+ .attach = mqprio_attach,
+ .change_real_num_tx = mqprio_change_real_num_tx,
+ .dump = mqprio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init mqprio_module_init(void)
+{
+ return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{
+ unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
new file mode 100644
index 000000000..1c2f9a3ab
--- /dev/null
+++ b/net/sched/sch_multiq.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+struct multiq_sched_data {
+ u16 bands;
+ u16 max_bands;
+ u16 curband;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ struct Qdisc **queues;
+};
+
+
+static struct Qdisc *
+multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ u32 band;
+ struct tcf_result res;
+ struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ err = tcf_classify(skb, fl, &res, false);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ band = skb_get_queue_mapping(skb);
+
+ if (band >= q->bands)
+ return q->queues[0];
+
+ return q->queues[band];
+}
+
+static int
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct Qdisc *qdisc;
+ int ret;
+
+ qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+ if (qdisc == NULL) {
+
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+#endif
+
+ ret = qdisc_enqueue(skb, qdisc, to_free);
+ if (ret == NET_XMIT_SUCCESS) {
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ return ret;
+}
+
+static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ struct sk_buff *skb;
+ int band;
+
+ for (band = 0; band < q->bands; band++) {
+ /* cycle through bands to ensure fairness */
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+
+ /* Check that target subqueue is available before
+ * pulling an skb to avoid head-of-line blocking.
+ */
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ }
+ return NULL;
+
+}
+
+static struct sk_buff *multiq_peek(struct Qdisc *sch)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned int curband = q->curband;
+ struct Qdisc *qdisc;
+ struct sk_buff *skb;
+ int band;
+
+ for (band = 0; band < q->bands; band++) {
+ /* cycle through bands to ensure fairness */
+ curband++;
+ if (curband >= q->bands)
+ curband = 0;
+
+ /* Check that target subqueue is available before
+ * pulling an skb to avoid head-of-line blocking.
+ */
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), curband))) {
+ qdisc = q->queues[curband];
+ skb = qdisc->ops->peek(qdisc);
+ if (skb)
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static void
+multiq_reset(struct Qdisc *sch)
+{
+ u16 band;
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ for (band = 0; band < q->bands; band++)
+ qdisc_reset(q->queues[band]);
+ sch->q.qlen = 0;
+ q->curband = 0;
+}
+
+static void
+multiq_destroy(struct Qdisc *sch)
+{
+ int band;
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ for (band = 0; band < q->bands; band++)
+ qdisc_put(q->queues[band]);
+
+ kfree(q->queues);
+}
+
+static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ struct tc_multiq_qopt *qopt;
+ int i;
+
+ if (!netif_is_multiqueue(qdisc_dev(sch)))
+ return -EOPNOTSUPP;
+ if (nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+
+ qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+
+ sch_tree_lock(sch);
+ q->bands = qopt->bands;
+ for (i = q->bands; i < q->max_bands; i++) {
+ if (q->queues[i] != &noop_qdisc) {
+ struct Qdisc *child = q->queues[i];
+ q->queues[i] = &noop_qdisc;
+ qdisc_tree_reduce_backlog(child, child->q.qlen,
+ child->qstats.backlog);
+ qdisc_put(child);
+ }
+ }
+
+ sch_tree_unlock(sch);
+
+ for (i = 0; i < q->bands; i++) {
+ if (q->queues[i] == &noop_qdisc) {
+ struct Qdisc *child, *old;
+ child = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle,
+ i + 1), extack);
+ if (child) {
+ sch_tree_lock(sch);
+ old = q->queues[i];
+ q->queues[i] = child;
+ if (child != &noop_qdisc)
+ qdisc_hash_add(child, true);
+
+ if (old != &noop_qdisc) {
+ qdisc_tree_reduce_backlog(old,
+ old->q.qlen,
+ old->qstats.backlog);
+ qdisc_put(old);
+ }
+ sch_tree_unlock(sch);
+ }
+ }
+ }
+ return 0;
+}
+
+static int multiq_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ int i, err;
+
+ q->queues = NULL;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ q->max_bands = qdisc_dev(sch)->num_tx_queues;
+
+ q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
+ if (!q->queues)
+ return -ENOBUFS;
+ for (i = 0; i < q->max_bands; i++)
+ q->queues[i] = &noop_qdisc;
+
+ return multiq_tune(sch, opt, extack);
+}
+
+static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_multiq_qopt opt;
+
+ opt.bands = q->bands;
+ opt.max_bands = q->max_bands;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ *old = qdisc_replace(sch, new, &q->queues[band]);
+ return 0;
+}
+
+static struct Qdisc *
+multiq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ return q->queues[band];
+}
+
+static unsigned long multiq_find(struct Qdisc *sch, u32 classid)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned long band = TC_H_MIN(classid);
+
+ if (band - 1 >= q->bands)
+ return 0;
+ return band;
+}
+
+static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return multiq_find(sch, classid);
+}
+
+
+static void multiq_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ tcm->tcm_info = q->queues[cl - 1]->handle;
+ return 0;
+}
+
+static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *cl_q;
+
+ cl_q = q->queues[cl - 1];
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ int band;
+
+ if (arg->stop)
+ return;
+
+ for (band = 0; band < q->bands; band++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, band + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->block;
+}
+
+static const struct Qdisc_class_ops multiq_class_ops = {
+ .graft = multiq_graft,
+ .leaf = multiq_leaf,
+ .find = multiq_find,
+ .walk = multiq_walk,
+ .tcf_block = multiq_tcf_block,
+ .bind_tcf = multiq_bind,
+ .unbind_tcf = multiq_unbind,
+ .dump = multiq_dump_class,
+ .dump_stats = multiq_dump_class_stats,
+};
+
+static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &multiq_class_ops,
+ .id = "multiq",
+ .priv_size = sizeof(struct multiq_sched_data),
+ .enqueue = multiq_enqueue,
+ .dequeue = multiq_dequeue,
+ .peek = multiq_peek,
+ .init = multiq_init,
+ .reset = multiq_reset,
+ .destroy = multiq_destroy,
+ .change = multiq_tune,
+ .dump = multiq_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init multiq_module_init(void)
+{
+ return register_qdisc(&multiq_qdisc_ops);
+}
+
+static void __exit multiq_module_exit(void)
+{
+ unregister_qdisc(&multiq_qdisc_ops);
+}
+
+module_init(multiq_module_init)
+module_exit(multiq_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
new file mode 100644
index 000000000..ad400f4f9
--- /dev/null
+++ b/net/sched/sch_netem.c
@@ -0,0 +1,1275 @@
+/*
+ * net/sched/sch_netem.c Network emulator
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License.
+ *
+ * Many of the algorithms and ideas for this came from
+ * NIST Net which is not copyrighted.
+ *
+ * Authors: Stephen Hemminger <shemminger@osdl.org>
+ * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
+
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+#define VERSION "1.3"
+
+/* Network Emulation Queuing algorithm.
+ ====================================
+
+ Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
+ Network Emulation Tool
+ [2] Luigi Rizzo, DummyNet for FreeBSD
+
+ ----------------------------------------------------------------
+
+ This started out as a simple way to delay outgoing packets to
+ test TCP but has grown to include most of the functionality
+ of a full blown network emulator like NISTnet. It can delay
+ packets and add random jitter (and correlation). The random
+ distribution can be loaded from a table as well to provide
+ normal, Pareto, or experimental curves. Packet loss,
+ duplication, and reordering can also be emulated.
+
+ This qdisc does not do classification that can be handled in
+ layering other disciplines. It does not need to do bandwidth
+ control either since that can be handled by using token
+ bucket or other rate control.
+
+ Correlated Loss Generator models
+
+ Added generation of correlated loss according to the
+ "Gilbert-Elliot" model, a 4-state markov model.
+
+ References:
+ [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
+ [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
+ and intuitive loss model for packet networks and its implementation
+ in the Netem module in the Linux kernel", available in [1]
+
+ Authors: Stefano Salsano <stefano.salsano at uniroma2.it
+ Fabio Ludovici <fabio.ludovici at yahoo.it>
+*/
+
+struct disttable {
+ u32 size;
+ s16 table[0];
+};
+
+struct netem_sched_data {
+ /* internal t(ime)fifo qdisc uses t_root and sch->limit */
+ struct rb_root t_root;
+
+ /* optional qdisc for classful handling (NULL at netem init) */
+ struct Qdisc *qdisc;
+
+ struct qdisc_watchdog watchdog;
+
+ s64 latency;
+ s64 jitter;
+
+ u32 loss;
+ u32 ecn;
+ u32 limit;
+ u32 counter;
+ u32 gap;
+ u32 duplicate;
+ u32 reorder;
+ u32 corrupt;
+ u64 rate;
+ s32 packet_overhead;
+ u32 cell_size;
+ struct reciprocal_value cell_size_reciprocal;
+ s32 cell_overhead;
+
+ struct crndstate {
+ u32 last;
+ u32 rho;
+ } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
+
+ struct disttable *delay_dist;
+
+ enum {
+ CLG_RANDOM,
+ CLG_4_STATES,
+ CLG_GILB_ELL,
+ } loss_model;
+
+ enum {
+ TX_IN_GAP_PERIOD = 1,
+ TX_IN_BURST_PERIOD,
+ LOST_IN_GAP_PERIOD,
+ LOST_IN_BURST_PERIOD,
+ } _4_state_model;
+
+ enum {
+ GOOD_STATE = 1,
+ BAD_STATE,
+ } GE_state_model;
+
+ /* Correlated Loss Generation models */
+ struct clgstate {
+ /* state of the Markov chain */
+ u8 state;
+
+ /* 4-states and Gilbert-Elliot models */
+ u32 a1; /* p13 for 4-states or p for GE */
+ u32 a2; /* p31 for 4-states or r for GE */
+ u32 a3; /* p32 for 4-states or h for GE */
+ u32 a4; /* p14 for 4-states or 1-k for GE */
+ u32 a5; /* p23 used only in 4-states */
+ } clg;
+
+ struct tc_netem_slot slot_config;
+ struct slotstate {
+ u64 slot_next;
+ s32 packets_left;
+ s32 bytes_left;
+ } slot;
+
+ struct disttable *slot_dist;
+};
+
+/* Time stamp put into socket buffer control block
+ * Only valid when skbs are in our internal t(ime)fifo queue.
+ *
+ * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
+ * and skb->next & skb->prev are scratch space for a qdisc,
+ * we save skb->tstamp value in skb->cb[] before destroying it.
+ */
+struct netem_skb_cb {
+ u64 time_to_send;
+};
+
+static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
+{
+ /* we assume we can use skb next/prev/tstamp as storage for rb_node */
+ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
+ return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/* init_crandom - initialize correlated random number generator
+ * Use entropy source for initial seed.
+ */
+static void init_crandom(struct crndstate *state, unsigned long rho)
+{
+ state->rho = rho;
+ state->last = prandom_u32();
+}
+
+/* get_crandom - correlated random number generator
+ * Next number depends on last value.
+ * rho is scaled to avoid floating point.
+ */
+static u32 get_crandom(struct crndstate *state)
+{
+ u64 value, rho;
+ unsigned long answer;
+
+ if (!state || state->rho == 0) /* no correlation */
+ return prandom_u32();
+
+ value = prandom_u32();
+ rho = (u64)state->rho + 1;
+ answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
+ state->last = answer;
+ return answer;
+}
+
+/* loss_4state - 4-state model loss generator
+ * Generates losses according to the 4-state Markov chain adopted in
+ * the GI (General and Intuitive) loss model.
+ */
+static bool loss_4state(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+ u32 rnd = prandom_u32();
+
+ /*
+ * Makes a comparison between rnd and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state and if the next packet has to be transmitted or lost.
+ * The four states correspond to:
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
+ * LOST_IN_BURST_PERIOD => isolated losses within a gap period
+ * LOST_IN_GAP_PERIOD => lost packets within a burst period
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
+ */
+ switch (clg->state) {
+ case TX_IN_GAP_PERIOD:
+ if (rnd < clg->a4) {
+ clg->state = LOST_IN_BURST_PERIOD;
+ return true;
+ } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else if (clg->a1 + clg->a4 < rnd) {
+ clg->state = TX_IN_GAP_PERIOD;
+ }
+
+ break;
+ case TX_IN_BURST_PERIOD:
+ if (rnd < clg->a5) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else {
+ clg->state = TX_IN_BURST_PERIOD;
+ }
+
+ break;
+ case LOST_IN_GAP_PERIOD:
+ if (rnd < clg->a3)
+ clg->state = TX_IN_BURST_PERIOD;
+ else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+ clg->state = TX_IN_GAP_PERIOD;
+ } else if (clg->a2 + clg->a3 < rnd) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ }
+ break;
+ case LOST_IN_BURST_PERIOD:
+ clg->state = TX_IN_GAP_PERIOD;
+ break;
+ }
+
+ return false;
+}
+
+/* loss_gilb_ell - Gilbert-Elliot model loss generator
+ * Generates losses according to the Gilbert-Elliot loss model or
+ * its special cases (Gilbert or Simple Gilbert)
+ *
+ * Makes a comparison between random number and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state. A second random number is extracted and the comparison
+ * with the loss probability of the current state decides if the next
+ * packet will be transmitted or lost.
+ */
+static bool loss_gilb_ell(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+
+ switch (clg->state) {
+ case GOOD_STATE:
+ if (prandom_u32() < clg->a1)
+ clg->state = BAD_STATE;
+ if (prandom_u32() < clg->a4)
+ return true;
+ break;
+ case BAD_STATE:
+ if (prandom_u32() < clg->a2)
+ clg->state = GOOD_STATE;
+ if (prandom_u32() > clg->a3)
+ return true;
+ }
+
+ return false;
+}
+
+static bool loss_event(struct netem_sched_data *q)
+{
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* Random packet drop 0 => none, ~0 => all */
+ return q->loss && q->loss >= get_crandom(&q->loss_cor);
+
+ case CLG_4_STATES:
+ /* 4state loss model algorithm (used also for GI model)
+ * Extracts a value from the markov 4 state loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_4state(q);
+
+ case CLG_GILB_ELL:
+ /* Gilbert-Elliot loss model algorithm
+ * Extracts a value from the Gilbert-Elliot loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_gilb_ell(q);
+ }
+
+ return false; /* not reached */
+}
+
+
+/* tabledist - return a pseudo-randomly distributed value with mean mu and
+ * std deviation sigma. Uses table lookup to approximate the desired
+ * distribution, and a uniformly-distributed pseudo-random source.
+ */
+static s64 tabledist(s64 mu, s32 sigma,
+ struct crndstate *state,
+ const struct disttable *dist)
+{
+ s64 x;
+ long t;
+ u32 rnd;
+
+ if (sigma == 0)
+ return mu;
+
+ rnd = get_crandom(state);
+
+ /* default uniform distribution */
+ if (dist == NULL)
+ return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
+
+ t = dist->table[rnd % dist->size];
+ x = (sigma % NETEM_DIST_SCALE) * t;
+ if (x >= 0)
+ x += NETEM_DIST_SCALE/2;
+ else
+ x -= NETEM_DIST_SCALE/2;
+
+ return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
+}
+
+static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
+{
+ len += q->packet_overhead;
+
+ if (q->cell_size) {
+ u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
+
+ if (len > cells * q->cell_size) /* extra cell needed for remainder */
+ cells++;
+ len = cells * (q->cell_size + q->cell_overhead);
+ }
+
+ return div64_u64(len * NSEC_PER_SEC, q->rate);
+}
+
+static void tfifo_reset(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p = rb_first(&q->t_root);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &q->t_root);
+ rtnl_kfree_skbs(skb, skb);
+ }
+}
+
+static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ u64 tnext = netem_skb_cb(nskb)->time_to_send;
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (tnext >= netem_skb_cb(skb)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->t_root);
+ sch->q.qlen++;
+}
+
+/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
+ * when we statistically choose to corrupt one, we instead segment it, returning
+ * the first packet to be corrupted, and re-enqueue the remaining frames
+ */
+static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct sk_buff *segs;
+ netdev_features_t features = netif_skb_features(skb);
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+ if (IS_ERR_OR_NULL(segs)) {
+ qdisc_drop(skb, sch, to_free);
+ return NULL;
+ }
+ consume_skb(skb);
+ return segs;
+}
+
+static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
+{
+ skb->next = qh->head;
+
+ if (!qh->head)
+ qh->tail = skb;
+ qh->head = skb;
+ qh->qlen++;
+}
+
+/*
+ * Insert one skb into qdisc.
+ * Note: parent depends on return value to account for queue length.
+ * NET_XMIT_DROP: queue length didn't change.
+ * NET_XMIT_SUCCESS: one skb was queued.
+ */
+static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ /* We don't fill cb now as skb_unshare() may invalidate it */
+ struct netem_skb_cb *cb;
+ struct sk_buff *skb2;
+ struct sk_buff *segs = NULL;
+ unsigned int prev_len = qdisc_pkt_len(skb);
+ int count = 1;
+ int rc = NET_XMIT_SUCCESS;
+ int rc_drop = NET_XMIT_DROP;
+
+ /* Do not fool qdisc_drop_all() */
+ skb->prev = NULL;
+
+ /* Random duplication */
+ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+ ++count;
+
+ /* Drop packet? */
+ if (loss_event(q)) {
+ if (q->ecn && INET_ECN_set_ce(skb))
+ qdisc_qstats_drop(sch); /* mark packet */
+ else
+ --count;
+ }
+ if (count == 0) {
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ }
+
+ /* If a delay is expected, orphan the skb. (orphaning usually takes
+ * place at TX completion time, so _before_ the link transit delay)
+ */
+ if (q->latency || q->jitter || q->rate)
+ skb_orphan_partial(skb);
+
+ /*
+ * If we need to duplicate packet, then re-insert at top of the
+ * qdisc tree, since parent queuer expects that only one
+ * skb will be queued.
+ */
+ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
+ struct Qdisc *rootq = qdisc_root_bh(sch);
+ u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+
+ q->duplicate = 0;
+ rootq->enqueue(skb2, rootq, to_free);
+ q->duplicate = dupsave;
+ rc_drop = NET_XMIT_SUCCESS;
+ }
+
+ /*
+ * Randomized packet corruption.
+ * Make copy if needed since we are modifying
+ * If packet is going to be hardware checksummed, then
+ * do it now in software before we mangle it.
+ */
+ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+ if (skb_is_gso(skb)) {
+ segs = netem_segment(skb, sch, to_free);
+ if (!segs)
+ return rc_drop;
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ } else {
+ segs = skb;
+ }
+
+ skb = segs;
+ segs = segs->next;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ qdisc_qstats_drop(sch);
+ goto finish_segs;
+ }
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb)) {
+ qdisc_drop(skb, sch, to_free);
+ skb = NULL;
+ goto finish_segs;
+ }
+
+ skb->data[prandom_u32() % skb_headlen(skb)] ^=
+ 1<<(prandom_u32() % 8);
+ }
+
+ if (unlikely(sch->q.qlen >= sch->limit)) {
+ qdisc_drop_all(skb, sch, to_free);
+ return rc_drop;
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+
+ cb = netem_skb_cb(skb);
+ if (q->gap == 0 || /* not doing reordering */
+ q->counter < q->gap - 1 || /* inside last reordering gap */
+ q->reorder < get_crandom(&q->reorder_cor)) {
+ u64 now;
+ s64 delay;
+
+ delay = tabledist(q->latency, q->jitter,
+ &q->delay_cor, q->delay_dist);
+
+ now = ktime_get_ns();
+
+ if (q->rate) {
+ struct netem_skb_cb *last = NULL;
+
+ if (sch->q.tail)
+ last = netem_skb_cb(sch->q.tail);
+ if (q->t_root.rb_node) {
+ struct sk_buff *t_skb;
+ struct netem_skb_cb *t_last;
+
+ t_skb = skb_rb_last(&q->t_root);
+ t_last = netem_skb_cb(t_skb);
+ if (!last ||
+ t_last->time_to_send > last->time_to_send) {
+ last = t_last;
+ }
+ }
+
+ if (last) {
+ /*
+ * Last packet in queue is reference point (now),
+ * calculate this time bonus and subtract
+ * from delay.
+ */
+ delay -= last->time_to_send - now;
+ delay = max_t(s64, 0, delay);
+ now = last->time_to_send;
+ }
+
+ delay += packet_time_ns(qdisc_pkt_len(skb), q);
+ }
+
+ cb->time_to_send = now + delay;
+ ++q->counter;
+ tfifo_enqueue(skb, sch);
+ } else {
+ /*
+ * Do re-ordering by putting one out of N packets at the front
+ * of the queue.
+ */
+ cb->time_to_send = ktime_get_ns();
+ q->counter = 0;
+
+ netem_enqueue_skb_head(&sch->q, skb);
+ sch->qstats.requeues++;
+ }
+
+finish_segs:
+ if (segs) {
+ unsigned int len, last_len;
+ int nb;
+
+ len = skb ? skb->len : 0;
+ nb = skb ? 1 : 0;
+
+ while (segs) {
+ skb2 = segs->next;
+ segs->next = NULL;
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ last_len = segs->len;
+ rc = qdisc_enqueue(segs, sch, to_free);
+ if (rc != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(rc))
+ qdisc_qstats_drop(sch);
+ } else {
+ nb++;
+ len += last_len;
+ }
+ segs = skb2;
+ }
+ /* Parent qdiscs accounted for 1 skb of size @prev_len */
+ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
+ } else if (!skb) {
+ return NET_XMIT_DROP;
+ }
+ return NET_XMIT_SUCCESS;
+}
+
+/* Delay the next round with a new future slot with a
+ * correct number of bytes and packets.
+ */
+
+static void get_slot_next(struct netem_sched_data *q, u64 now)
+{
+ s64 next_delay;
+
+ if (!q->slot_dist)
+ next_delay = q->slot_config.min_delay +
+ (prandom_u32() *
+ (q->slot_config.max_delay -
+ q->slot_config.min_delay) >> 32);
+ else
+ next_delay = tabledist(q->slot_config.dist_delay,
+ (s32)(q->slot_config.dist_jitter),
+ NULL, q->slot_dist);
+
+ q->slot.slot_next = now + next_delay;
+ q->slot.packets_left = q->slot_config.max_packets;
+ q->slot.bytes_left = q->slot_config.max_bytes;
+}
+
+static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ struct rb_node *p;
+
+tfifo_dequeue:
+ skb = __qdisc_dequeue_head(&sch->q);
+ if (skb) {
+ qdisc_qstats_backlog_dec(sch, skb);
+deliver:
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+ p = rb_first(&q->t_root);
+ if (p) {
+ u64 time_to_send;
+ u64 now = ktime_get_ns();
+
+ skb = rb_to_skb(p);
+
+ /* if more time remaining? */
+ time_to_send = netem_skb_cb(skb)->time_to_send;
+ if (q->slot.slot_next && q->slot.slot_next < time_to_send)
+ get_slot_next(q, now);
+
+ if (time_to_send <= now && q->slot.slot_next <= now) {
+ rb_erase(p, &q->t_root);
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ skb->next = NULL;
+ skb->prev = NULL;
+ /* skb->dev shares skb->rbnode area,
+ * we need to restore its value.
+ */
+ skb->dev = qdisc_dev(sch);
+
+#ifdef CONFIG_NET_CLS_ACT
+ /*
+ * If it's at ingress let's pretend the delay is
+ * from the network (tstamp will be updated).
+ */
+ if (skb->tc_redirected && skb->tc_from_ingress)
+ skb->tstamp = 0;
+#endif
+
+ if (q->slot.slot_next) {
+ q->slot.packets_left--;
+ q->slot.bytes_left -= qdisc_pkt_len(skb);
+ if (q->slot.packets_left <= 0 ||
+ q->slot.bytes_left <= 0)
+ get_slot_next(q, now);
+ }
+
+ if (q->qdisc) {
+ unsigned int pkt_len = qdisc_pkt_len(skb);
+ struct sk_buff *to_free = NULL;
+ int err;
+
+ err = qdisc_enqueue(skb, q->qdisc, &to_free);
+ kfree_skb_list(to_free);
+ if (err != NET_XMIT_SUCCESS &&
+ net_xmit_drop_count(err)) {
+ qdisc_qstats_drop(sch);
+ qdisc_tree_reduce_backlog(sch, 1,
+ pkt_len);
+ }
+ goto tfifo_dequeue;
+ }
+ goto deliver;
+ }
+
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
+
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ max(time_to_send,
+ q->slot.slot_next));
+ }
+
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
+ return NULL;
+}
+
+static void netem_reset(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+ tfifo_reset(sch);
+ if (q->qdisc)
+ qdisc_reset(q->qdisc);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static void dist_free(struct disttable *d)
+{
+ kvfree(d);
+}
+
+/*
+ * Distribution data is a variable size payload containing
+ * signed 16 bit values.
+ */
+
+static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
+ const struct nlattr *attr)
+{
+ size_t n = nla_len(attr)/sizeof(__s16);
+ const __s16 *data = nla_data(attr);
+ spinlock_t *root_lock;
+ struct disttable *d;
+ int i;
+
+ if (!n || n > NETEM_DIST_MAX)
+ return -EINVAL;
+
+ d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->size = n;
+ for (i = 0; i < n; i++)
+ d->table[i] = data[i];
+
+ root_lock = qdisc_root_sleeping_lock(sch);
+
+ spin_lock_bh(root_lock);
+ swap(*tbl, d);
+ spin_unlock_bh(root_lock);
+
+ dist_free(d);
+ return 0;
+}
+
+static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_slot *c = nla_data(attr);
+
+ q->slot_config = *c;
+ if (q->slot_config.max_packets == 0)
+ q->slot_config.max_packets = INT_MAX;
+ if (q->slot_config.max_bytes == 0)
+ q->slot_config.max_bytes = INT_MAX;
+
+ /* capping dist_jitter to the range acceptable by tabledist() */
+ q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
+
+ q->slot.packets_left = q->slot_config.max_packets;
+ q->slot.bytes_left = q->slot_config.max_bytes;
+ if (q->slot_config.min_delay | q->slot_config.max_delay |
+ q->slot_config.dist_jitter)
+ q->slot.slot_next = ktime_get_ns();
+ else
+ q->slot.slot_next = 0;
+}
+
+static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_corr *c = nla_data(attr);
+
+ init_crandom(&q->delay_cor, c->delay_corr);
+ init_crandom(&q->loss_cor, c->loss_corr);
+ init_crandom(&q->dup_cor, c->dup_corr);
+}
+
+static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_reorder *r = nla_data(attr);
+
+ q->reorder = r->probability;
+ init_crandom(&q->reorder_cor, r->correlation);
+}
+
+static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_corrupt *r = nla_data(attr);
+
+ q->corrupt = r->probability;
+ init_crandom(&q->corrupt_cor, r->correlation);
+}
+
+static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_rate *r = nla_data(attr);
+
+ q->rate = r->rate;
+ q->packet_overhead = r->packet_overhead;
+ q->cell_size = r->cell_size;
+ q->cell_overhead = r->cell_overhead;
+ if (q->cell_size)
+ q->cell_size_reciprocal = reciprocal_value(q->cell_size);
+ else
+ q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
+}
+
+static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct nlattr *la;
+ int rem;
+
+ nla_for_each_nested(la, attr, rem) {
+ u16 type = nla_type(la);
+
+ switch (type) {
+ case NETEM_LOSS_GI: {
+ const struct tc_netem_gimodel *gi = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_4_STATES;
+
+ q->clg.state = TX_IN_GAP_PERIOD;
+ q->clg.a1 = gi->p13;
+ q->clg.a2 = gi->p31;
+ q->clg.a3 = gi->p32;
+ q->clg.a4 = gi->p14;
+ q->clg.a5 = gi->p23;
+ break;
+ }
+
+ case NETEM_LOSS_GE: {
+ const struct tc_netem_gemodel *ge = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
+ pr_info("netem: incorrect ge model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_GILB_ELL;
+ q->clg.state = GOOD_STATE;
+ q->clg.a1 = ge->p;
+ q->clg.a2 = ge->r;
+ q->clg.a3 = ge->h;
+ q->clg.a4 = ge->k1;
+ break;
+ }
+
+ default:
+ pr_info("netem: unknown loss type %u\n", type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
+ [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
+ [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
+ [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
+ [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
+ [TCA_NETEM_ECN] = { .type = NLA_U32 },
+ [TCA_NETEM_RATE64] = { .type = NLA_U64 },
+ [TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
+ [TCA_NETEM_JITTER64] = { .type = NLA_S64 },
+ [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
+};
+
+static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy, int len)
+{
+ int nested_len = nla_len(nla) - NLA_ALIGN(len);
+
+ if (nested_len < 0) {
+ pr_info("netem: invalid attributes len %d\n", nested_len);
+ return -EINVAL;
+ }
+
+ if (nested_len >= nla_attr_size(0))
+ return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
+ nested_len, policy, NULL);
+
+ memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+ return 0;
+}
+
+/* Parse netlink message to set options */
+static int netem_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_NETEM_MAX + 1];
+ struct tc_netem_qopt *qopt;
+ struct clgstate old_clg;
+ int old_loss_model = CLG_RANDOM;
+ int ret;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+ ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
+ if (ret < 0)
+ return ret;
+
+ /* backup q->clg and q->loss_model */
+ old_clg = q->clg;
+ old_loss_model = q->loss_model;
+
+ if (tb[TCA_NETEM_LOSS]) {
+ ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
+ if (ret) {
+ q->loss_model = old_loss_model;
+ return ret;
+ }
+ } else {
+ q->loss_model = CLG_RANDOM;
+ }
+
+ if (tb[TCA_NETEM_DELAY_DIST]) {
+ ret = get_dist_table(sch, &q->delay_dist,
+ tb[TCA_NETEM_DELAY_DIST]);
+ if (ret)
+ goto get_table_failure;
+ }
+
+ if (tb[TCA_NETEM_SLOT_DIST]) {
+ ret = get_dist_table(sch, &q->slot_dist,
+ tb[TCA_NETEM_SLOT_DIST]);
+ if (ret)
+ goto get_table_failure;
+ }
+
+ sch->limit = qopt->limit;
+
+ q->latency = PSCHED_TICKS2NS(qopt->latency);
+ q->jitter = PSCHED_TICKS2NS(qopt->jitter);
+ q->limit = qopt->limit;
+ q->gap = qopt->gap;
+ q->counter = 0;
+ q->loss = qopt->loss;
+ q->duplicate = qopt->duplicate;
+
+ /* for compatibility with earlier versions.
+ * if gap is set, need to assume 100% probability
+ */
+ if (q->gap)
+ q->reorder = ~0;
+
+ if (tb[TCA_NETEM_CORR])
+ get_correlation(q, tb[TCA_NETEM_CORR]);
+
+ if (tb[TCA_NETEM_REORDER])
+ get_reorder(q, tb[TCA_NETEM_REORDER]);
+
+ if (tb[TCA_NETEM_CORRUPT])
+ get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
+
+ if (tb[TCA_NETEM_RATE])
+ get_rate(q, tb[TCA_NETEM_RATE]);
+
+ if (tb[TCA_NETEM_RATE64])
+ q->rate = max_t(u64, q->rate,
+ nla_get_u64(tb[TCA_NETEM_RATE64]));
+
+ if (tb[TCA_NETEM_LATENCY64])
+ q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
+
+ if (tb[TCA_NETEM_JITTER64])
+ q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
+
+ if (tb[TCA_NETEM_ECN])
+ q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
+
+ if (tb[TCA_NETEM_SLOT])
+ get_slot(q, tb[TCA_NETEM_SLOT]);
+
+ /* capping jitter to the range acceptable by tabledist() */
+ q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
+
+ return ret;
+
+get_table_failure:
+ /* recover clg and loss_model, in case of
+ * q->clg and q->loss_model were modified
+ * in get_loss_clg()
+ */
+ q->clg = old_clg;
+ q->loss_model = old_loss_model;
+ return ret;
+}
+
+static int netem_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ int ret;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (!opt)
+ return -EINVAL;
+
+ q->loss_model = CLG_RANDOM;
+ ret = netem_change(sch, opt, extack);
+ if (ret)
+ pr_info("netem: change failed\n");
+ return ret;
+}
+
+static void netem_destroy(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ if (q->qdisc)
+ qdisc_put(q->qdisc);
+ dist_free(q->delay_dist);
+ dist_free(q->slot_dist);
+}
+
+static int dump_loss_model(const struct netem_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* legacy loss model */
+ nla_nest_cancel(skb, nest);
+ return 0; /* no data */
+
+ case CLG_4_STATES: {
+ struct tc_netem_gimodel gi = {
+ .p13 = q->clg.a1,
+ .p31 = q->clg.a2,
+ .p32 = q->clg.a3,
+ .p14 = q->clg.a4,
+ .p23 = q->clg.a5,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
+ goto nla_put_failure;
+ break;
+ }
+ case CLG_GILB_ELL: {
+ struct tc_netem_gemodel ge = {
+ .p = q->clg.a1,
+ .r = q->clg.a2,
+ .h = q->clg.a3,
+ .k1 = q->clg.a4,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
+ goto nla_put_failure;
+ break;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ const struct netem_sched_data *q = qdisc_priv(sch);
+ struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
+ struct tc_netem_qopt qopt;
+ struct tc_netem_corr cor;
+ struct tc_netem_reorder reorder;
+ struct tc_netem_corrupt corrupt;
+ struct tc_netem_rate rate;
+ struct tc_netem_slot slot;
+
+ qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
+ UINT_MAX);
+ qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
+ UINT_MAX);
+ qopt.limit = q->limit;
+ qopt.loss = q->loss;
+ qopt.gap = q->gap;
+ qopt.duplicate = q->duplicate;
+ if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
+ goto nla_put_failure;
+
+ cor.delay_corr = q->delay_cor.rho;
+ cor.loss_corr = q->loss_cor.rho;
+ cor.dup_corr = q->dup_cor.rho;
+ if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
+ goto nla_put_failure;
+
+ reorder.probability = q->reorder;
+ reorder.correlation = q->reorder_cor.rho;
+ if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
+ goto nla_put_failure;
+
+ corrupt.probability = q->corrupt;
+ corrupt.correlation = q->corrupt_cor.rho;
+ if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
+ goto nla_put_failure;
+
+ if (q->rate >= (1ULL << 32)) {
+ if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
+ TCA_NETEM_PAD))
+ goto nla_put_failure;
+ rate.rate = ~0U;
+ } else {
+ rate.rate = q->rate;
+ }
+ rate.packet_overhead = q->packet_overhead;
+ rate.cell_size = q->cell_size;
+ rate.cell_overhead = q->cell_overhead;
+ if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
+ goto nla_put_failure;
+
+ if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
+ goto nla_put_failure;
+
+ if (dump_loss_model(q, skb) != 0)
+ goto nla_put_failure;
+
+ if (q->slot_config.min_delay | q->slot_config.max_delay |
+ q->slot_config.dist_jitter) {
+ slot = q->slot_config;
+ if (slot.max_packets == INT_MAX)
+ slot.max_packets = 0;
+ if (slot.max_bytes == INT_MAX)
+ slot.max_bytes = 0;
+ if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
+ goto nla_put_failure;
+ }
+
+ return nla_nest_end(skb, nla);
+
+nla_put_failure:
+ nlmsg_trim(skb, nla);
+ return -1;
+}
+
+static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1 || !q->qdisc) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ *old = qdisc_replace(sch, new, &q->qdisc);
+ return 0;
+}
+
+static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long netem_find(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops netem_class_ops = {
+ .graft = netem_graft,
+ .leaf = netem_leaf,
+ .find = netem_find,
+ .walk = netem_walk,
+ .dump = netem_dump_class,
+};
+
+static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
+ .id = "netem",
+ .cl_ops = &netem_class_ops,
+ .priv_size = sizeof(struct netem_sched_data),
+ .enqueue = netem_enqueue,
+ .dequeue = netem_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = netem_init,
+ .reset = netem_reset,
+ .destroy = netem_destroy,
+ .change = netem_change,
+ .dump = netem_dump,
+ .owner = THIS_MODULE,
+};
+
+
+static int __init netem_module_init(void)
+{
+ pr_info("netem: version " VERSION "\n");
+ return register_qdisc(&netem_qdisc_ops);
+}
+static void __exit netem_module_exit(void)
+{
+ unregister_qdisc(&netem_qdisc_ops);
+}
+module_init(netem_module_init)
+module_exit(netem_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
new file mode 100644
index 000000000..18d30bb86
--- /dev/null
+++ b/net/sched/sch_pie.c
@@ -0,0 +1,572 @@
+/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Vijay Subramanian <vijaynsu@cisco.com>
+ * Author: Mythili Prabhu <mysuryan@cisco.com>
+ *
+ * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
+ * University of Oslo, Norway.
+ *
+ * References:
+ * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
+ * IEEE Conference on High Performance Switching and Routing 2013 :
+ * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+#define QUEUE_THRESHOLD 10000
+#define DQCOUNT_INVALID -1
+#define MAX_PROB 0xffffffff
+#define PIE_SCALE 8
+
+/* parameters used */
+struct pie_params {
+ psched_time_t target; /* user specified target delay in pschedtime */
+ u32 tupdate; /* timer frequency (in jiffies) */
+ u32 limit; /* number of packets that can be enqueued */
+ u32 alpha; /* alpha and beta are between 0 and 32 */
+ u32 beta; /* and are used for shift relative to 1 */
+ bool ecn; /* true if ecn is enabled */
+ bool bytemode; /* to scale drop early prob based on pkt size */
+};
+
+/* variables used */
+struct pie_vars {
+ u32 prob; /* probability but scaled by u32 limit. */
+ psched_time_t burst_time;
+ psched_time_t qdelay;
+ psched_time_t qdelay_old;
+ u64 dq_count; /* measured in bytes */
+ psched_time_t dq_tstamp; /* drain rate */
+ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
+ u32 qlen_old; /* in bytes */
+};
+
+/* statistics gathering */
+struct pie_stats {
+ u32 packets_in; /* total number of packets enqueued */
+ u32 dropped; /* packets dropped due to pie_action */
+ u32 overlimit; /* dropped due to lack of space in queue */
+ u32 maxq; /* maximum queue size */
+ u32 ecn_mark; /* packets marked with ECN */
+};
+
+/* private data for the Qdisc */
+struct pie_sched_data {
+ struct pie_params params;
+ struct pie_vars vars;
+ struct pie_stats stats;
+ struct timer_list adapt_timer;
+ struct Qdisc *sch;
+};
+
+static void pie_params_init(struct pie_params *params)
+{
+ params->alpha = 2;
+ params->beta = 20;
+ params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
+ params->limit = 1000; /* default of 1000 packets */
+ params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
+ params->ecn = false;
+ params->bytemode = false;
+}
+
+static void pie_vars_init(struct pie_vars *vars)
+{
+ vars->dq_count = DQCOUNT_INVALID;
+ vars->avg_dq_rate = 0;
+ /* default of 100 ms in pschedtime */
+ vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
+}
+
+static bool drop_early(struct Qdisc *sch, u32 packet_size)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ u32 rnd;
+ u32 local_prob = q->vars.prob;
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+
+ /* If there is still burst allowance left skip random early drop */
+ if (q->vars.burst_time > 0)
+ return false;
+
+ /* If current delay is less than half of target, and
+ * if drop prob is low already, disable early_drop
+ */
+ if ((q->vars.qdelay < q->params.target / 2)
+ && (q->vars.prob < MAX_PROB / 5))
+ return false;
+
+ /* If we have fewer than 2 mtu-sized packets, disable drop_early,
+ * similar to min_th in RED
+ */
+ if (sch->qstats.backlog < 2 * mtu)
+ return false;
+
+ /* If bytemode is turned on, use packet size to compute new
+ * probablity. Smaller packets will have lower drop prob in this case
+ */
+ if (q->params.bytemode && packet_size <= mtu)
+ local_prob = (local_prob / mtu) * packet_size;
+ else
+ local_prob = q->vars.prob;
+
+ rnd = prandom_u32();
+ if (rnd < local_prob)
+ return true;
+
+ return false;
+}
+
+static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ bool enqueue = false;
+
+ if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+ q->stats.overlimit++;
+ goto out;
+ }
+
+ if (!drop_early(sch, skb->len)) {
+ enqueue = true;
+ } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
+ INET_ECN_set_ce(skb)) {
+ /* If packet is ecn capable, mark it if drop probability
+ * is lower than 10%, else drop it.
+ */
+ q->stats.ecn_mark++;
+ enqueue = true;
+ }
+
+ /* we can enqueue the packet */
+ if (enqueue) {
+ q->stats.packets_in++;
+ if (qdisc_qlen(sch) > q->stats.maxq)
+ q->stats.maxq = qdisc_qlen(sch);
+
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+out:
+ q->stats.dropped++;
+ return qdisc_drop(skb, sch, to_free);
+}
+
+static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
+ [TCA_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_PIE_BETA] = {.type = NLA_U32},
+ [TCA_PIE_ECN] = {.type = NLA_U32},
+ [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+};
+
+static int pie_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_PIE_MAX + 1];
+ unsigned int qlen, dropped = 0;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy, NULL);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ /* convert from microseconds to pschedtime */
+ if (tb[TCA_PIE_TARGET]) {
+ /* target is in us */
+ u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
+
+ /* convert to pschedtime */
+ q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+ }
+
+ /* tupdate is in jiffies */
+ if (tb[TCA_PIE_TUPDATE])
+ q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
+
+ if (tb[TCA_PIE_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
+
+ q->params.limit = limit;
+ sch->limit = limit;
+ }
+
+ if (tb[TCA_PIE_ALPHA])
+ q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
+
+ if (tb[TCA_PIE_BETA])
+ q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
+
+ if (tb[TCA_PIE_ECN])
+ q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
+
+ if (tb[TCA_PIE_BYTEMODE])
+ q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+
+ /* Drop excess packets if new limit is lower */
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+
+ dropped += qdisc_pkt_len(skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ rtnl_qdisc_drop(skb, sch);
+ }
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+{
+
+ struct pie_sched_data *q = qdisc_priv(sch);
+ int qlen = sch->qstats.backlog; /* current queue size in bytes */
+
+ /* If current queue is about 10 packets or more and dq_count is unset
+ * we have enough packets to calculate the drain rate. Save
+ * current time as dq_tstamp and start measurement cycle.
+ */
+ if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
+ q->vars.dq_tstamp = psched_get_time();
+ q->vars.dq_count = 0;
+ }
+
+ /* Calculate the average drain rate from this value. If queue length
+ * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
+ * the dq_count to -1 as we don't have enough packets to calculate the
+ * drain rate anymore The following if block is entered only when we
+ * have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
+ * and we calculate the drain rate for the threshold here. dq_count is
+ * in bytes, time difference in psched_time, hence rate is in
+ * bytes/psched_time.
+ */
+ if (q->vars.dq_count != DQCOUNT_INVALID) {
+ q->vars.dq_count += skb->len;
+
+ if (q->vars.dq_count >= QUEUE_THRESHOLD) {
+ psched_time_t now = psched_get_time();
+ u32 dtime = now - q->vars.dq_tstamp;
+ u32 count = q->vars.dq_count << PIE_SCALE;
+
+ if (dtime == 0)
+ return;
+
+ count = count / dtime;
+
+ if (q->vars.avg_dq_rate == 0)
+ q->vars.avg_dq_rate = count;
+ else
+ q->vars.avg_dq_rate =
+ (q->vars.avg_dq_rate -
+ (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+
+ /* If the queue has receded below the threshold, we hold
+ * on to the last drain rate calculated, else we reset
+ * dq_count to 0 to re-enter the if block when the next
+ * packet is dequeued
+ */
+ if (qlen < QUEUE_THRESHOLD)
+ q->vars.dq_count = DQCOUNT_INVALID;
+ else {
+ q->vars.dq_count = 0;
+ q->vars.dq_tstamp = psched_get_time();
+ }
+
+ if (q->vars.burst_time > 0) {
+ if (q->vars.burst_time > dtime)
+ q->vars.burst_time -= dtime;
+ else
+ q->vars.burst_time = 0;
+ }
+ }
+ }
+}
+
+static void calculate_probability(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ u32 qlen = sch->qstats.backlog; /* queue size in bytes */
+ psched_time_t qdelay = 0; /* in pschedtime */
+ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
+ s32 delta = 0; /* determines the change in probability */
+ u32 oldprob;
+ u32 alpha, beta;
+ bool update_prob = true;
+
+ q->vars.qdelay_old = q->vars.qdelay;
+
+ if (q->vars.avg_dq_rate > 0)
+ qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
+ else
+ qdelay = 0;
+
+ /* If qdelay is zero and qlen is not, it means qlen is very small, less
+ * than dequeue_rate, so we do not update probabilty in this round
+ */
+ if (qdelay == 0 && qlen != 0)
+ update_prob = false;
+
+ /* In the algorithm, alpha and beta are between 0 and 2 with typical
+ * value for alpha as 0.125. In this implementation, we use values 0-32
+ * passed from user space to represent this. Also, alpha and beta have
+ * unit of HZ and need to be scaled before they can used to update
+ * probability. alpha/beta are updated locally below by 1) scaling them
+ * appropriately 2) scaling down by 16 to come to 0-2 range.
+ * Please see paper for details.
+ *
+ * We scale alpha and beta differently depending on whether we are in
+ * light, medium or high dropping mode.
+ */
+ if (q->vars.prob < MAX_PROB / 100) {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+ } else if (q->vars.prob < MAX_PROB / 10) {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+ } else {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ }
+
+ /* alpha and beta should be between 0 and 32, in multiples of 1/16 */
+ delta += alpha * ((qdelay - q->params.target));
+ delta += beta * ((qdelay - qdelay_old));
+
+ oldprob = q->vars.prob;
+
+ /* to ensure we increase probability in steps of no more than 2% */
+ if (delta > (s32) (MAX_PROB / (100 / 2)) &&
+ q->vars.prob >= MAX_PROB / 10)
+ delta = (MAX_PROB / 100) * 2;
+
+ /* Non-linear drop:
+ * Tune drop probability to increase quickly for high delays(>= 250ms)
+ * 250ms is derived through experiments and provides error protection
+ */
+
+ if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
+ delta += MAX_PROB / (100 / 2);
+
+ q->vars.prob += delta;
+
+ if (delta > 0) {
+ /* prevent overflow */
+ if (q->vars.prob < oldprob) {
+ q->vars.prob = MAX_PROB;
+ /* Prevent normalization error. If probability is at
+ * maximum value already, we normalize it here, and
+ * skip the check to do a non-linear drop in the next
+ * section.
+ */
+ update_prob = false;
+ }
+ } else {
+ /* prevent underflow */
+ if (q->vars.prob > oldprob)
+ q->vars.prob = 0;
+ }
+
+ /* Non-linear drop in probability: Reduce drop probability quickly if
+ * delay is 0 for 2 consecutive Tupdate periods.
+ */
+
+ if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
+ q->vars.prob = (q->vars.prob * 98) / 100;
+
+ q->vars.qdelay = qdelay;
+ q->vars.qlen_old = qlen;
+
+ /* We restart the measurement cycle if the following conditions are met
+ * 1. If the delay has been low for 2 consecutive Tupdate periods
+ * 2. Calculated drop probability is zero
+ * 3. We have atleast one estimate for the avg_dq_rate ie.,
+ * is a non-zero value
+ */
+ if ((q->vars.qdelay < q->params.target / 2) &&
+ (q->vars.qdelay_old < q->params.target / 2) &&
+ (q->vars.prob == 0) &&
+ (q->vars.avg_dq_rate > 0))
+ pie_vars_init(&q->vars);
+}
+
+static void pie_timer(struct timer_list *t)
+{
+ struct pie_sched_data *q = from_timer(q, t, adapt_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ calculate_probability(sch);
+
+ /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
+ if (q->params.tupdate)
+ mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
+ spin_unlock(root_lock);
+
+}
+
+static int pie_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+
+ pie_params_init(&q->params);
+ pie_vars_init(&q->vars);
+ sch->limit = q->params.limit;
+
+ q->sch = sch;
+ timer_setup(&q->adapt_timer, pie_timer, 0);
+
+ if (opt) {
+ int err = pie_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+ return 0;
+}
+
+static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ /* convert target from pschedtime to us */
+ if (nla_put_u32(skb, TCA_PIE_TARGET,
+ ((u32) PSCHED_TICKS2NS(q->params.target)) /
+ NSEC_PER_USEC) ||
+ nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
+ nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
+ nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
+ nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
+ nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+
+}
+
+static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct tc_pie_xstats st = {
+ .prob = q->vars.prob,
+ .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
+ NSEC_PER_USEC,
+ /* unscale and return dq_rate in bytes per sec */
+ .avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
+ .packets_in = q->stats.packets_in,
+ .overlimit = q->stats.overlimit,
+ .maxq = q->stats.maxq,
+ .dropped = q->stats.dropped,
+ .ecn_mark = q->stats.ecn_mark,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ skb = qdisc_dequeue_head(sch);
+
+ if (!skb)
+ return NULL;
+
+ pie_process_dequeue(sch, skb);
+ return skb;
+}
+
+static void pie_reset(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ qdisc_reset_queue(sch);
+ pie_vars_init(&q->vars);
+}
+
+static void pie_destroy(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ q->params.tupdate = 0;
+ del_timer_sync(&q->adapt_timer);
+}
+
+static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
+ .id = "pie",
+ .priv_size = sizeof(struct pie_sched_data),
+ .enqueue = pie_qdisc_enqueue,
+ .dequeue = pie_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = pie_init,
+ .destroy = pie_destroy,
+ .reset = pie_reset,
+ .change = pie_change,
+ .dump = pie_dump,
+ .dump_stats = pie_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init pie_module_init(void)
+{
+ return register_qdisc(&pie_qdisc_ops);
+}
+
+static void __exit pie_module_exit(void)
+{
+ unregister_qdisc(&pie_qdisc_ops);
+}
+
+module_init(pie_module_init);
+module_exit(pie_module_exit);
+
+MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
+MODULE_AUTHOR("Vijay Subramanian");
+MODULE_AUTHOR("Mythili Prabhu");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 000000000..5619d2eb1
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,235 @@
+/*
+ * sch_plug.c Queue traffic until an explicit release command
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * There are two ways to use this qdisc:
+ * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
+ * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
+ *
+ * 2. For network output buffering (a.k.a output commit) functionality.
+ * Output commit property is commonly used by applications using checkpoint
+ * based fault-tolerance to ensure that the checkpoint from which a system
+ * is being restored is consistent w.r.t outside world.
+ *
+ * Consider for e.g. Remus - a Virtual Machine checkpointing system,
+ * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
+ * asynchronously to the backup host, while the VM continues executing the
+ * next epoch speculatively.
+ *
+ * The following is a typical sequence of output buffer operations:
+ * 1.At epoch i, start_buffer(i)
+ * 2. At end of epoch i (i.e. after 50ms):
+ * 2.1 Stop VM and take checkpoint(i).
+ * 2.2 start_buffer(i+1) and Resume VM
+ * 3. While speculatively executing epoch(i+1), asynchronously replicate
+ * checkpoint(i) to backup host.
+ * 4. When checkpoint_ack(i) is received from backup, release_buffer(i)
+ * Thus, this Qdisc would receive the following sequence of commands:
+ * TCQ_PLUG_BUFFER (epoch i)
+ * .. TCQ_PLUG_BUFFER (epoch i+1)
+ * ....TCQ_PLUG_RELEASE_ONE (epoch i)
+ * ......TCQ_PLUG_BUFFER (epoch i+2)
+ * ........
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/*
+ * State of the queue, when used for network output buffering:
+ *
+ * plug(i+1) plug(i) head
+ * ------------------+--------------------+---------------->
+ * | |
+ * | |
+ * pkts_current_epoch| pkts_last_epoch |pkts_to_release
+ * ----------------->|<--------+--------->|+--------------->
+ * v v
+ *
+ */
+
+struct plug_sched_data {
+ /* If true, the dequeue function releases all packets
+ * from head to end of the queue. The queue turns into
+ * a pass-through queue for newly arriving packets.
+ */
+ bool unplug_indefinite;
+
+ bool throttled;
+
+ /* Queue Limit in bytes */
+ u32 limit;
+
+ /* Number of packets (output) from the current speculatively
+ * executing epoch.
+ */
+ u32 pkts_current_epoch;
+
+ /* Number of packets corresponding to the recently finished
+ * epoch. These will be released when we receive a
+ * TCQ_PLUG_RELEASE_ONE command. This command is typically
+ * issued after committing a checkpoint at the target.
+ */
+ u32 pkts_last_epoch;
+
+ /*
+ * Number of packets from the head of the queue, that can
+ * be released (committed checkpoint).
+ */
+ u32 pkts_to_release;
+};
+
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
+ if (!q->unplug_indefinite)
+ q->pkts_current_epoch++;
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+ return qdisc_drop(skb, sch, to_free);
+}
+
+static struct sk_buff *plug_dequeue(struct Qdisc *sch)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ if (q->throttled)
+ return NULL;
+
+ if (!q->unplug_indefinite) {
+ if (!q->pkts_to_release) {
+ /* No more packets to dequeue. Block the queue
+ * and wait for the next release command.
+ */
+ q->throttled = true;
+ return NULL;
+ }
+ q->pkts_to_release--;
+ }
+
+ return qdisc_dequeue_head(sch);
+}
+
+static int plug_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ q->pkts_current_epoch = 0;
+ q->pkts_last_epoch = 0;
+ q->pkts_to_release = 0;
+ q->unplug_indefinite = false;
+
+ if (opt == NULL) {
+ q->limit = qdisc_dev(sch)->tx_queue_len
+ * psched_mtu(qdisc_dev(sch));
+ } else {
+ struct tc_plug_qopt *ctl = nla_data(opt);
+
+ if (nla_len(opt) < sizeof(*ctl))
+ return -EINVAL;
+
+ q->limit = ctl->limit;
+ }
+
+ q->throttled = true;
+ return 0;
+}
+
+/* Receives 4 types of messages:
+ * TCQ_PLUG_BUFFER: Inset a plug into the queue and
+ * buffer any incoming packets
+ * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
+ * to beginning of the next plug.
+ * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
+ * Stop buffering packets until the next TCQ_PLUG_BUFFER
+ * command is received (just act as a pass-thru queue).
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size
+ */
+static int plug_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+ struct tc_plug_qopt *msg;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ msg = nla_data(opt);
+ if (nla_len(opt) < sizeof(*msg))
+ return -EINVAL;
+
+ switch (msg->action) {
+ case TCQ_PLUG_BUFFER:
+ /* Save size of the current buffer */
+ q->pkts_last_epoch = q->pkts_current_epoch;
+ q->pkts_current_epoch = 0;
+ if (q->unplug_indefinite)
+ q->throttled = true;
+ q->unplug_indefinite = false;
+ break;
+ case TCQ_PLUG_RELEASE_ONE:
+ /* Add packets from the last complete buffer to the
+ * packets to be released set.
+ */
+ q->pkts_to_release += q->pkts_last_epoch;
+ q->pkts_last_epoch = 0;
+ q->throttled = false;
+ netif_schedule_queue(sch->dev_queue);
+ break;
+ case TCQ_PLUG_RELEASE_INDEFINITE:
+ q->unplug_indefinite = true;
+ q->pkts_to_release = 0;
+ q->pkts_last_epoch = 0;
+ q->pkts_current_epoch = 0;
+ q->throttled = false;
+ netif_schedule_queue(sch->dev_queue);
+ break;
+ case TCQ_PLUG_LIMIT:
+ /* Limit is supplied in bytes */
+ q->limit = msg->limit;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
+ .id = "plug",
+ .priv_size = sizeof(struct plug_sched_data),
+ .enqueue = plug_enqueue,
+ .dequeue = plug_dequeue,
+ .peek = qdisc_peek_head,
+ .init = plug_init,
+ .change = plug_change,
+ .reset = qdisc_reset_queue,
+ .owner = THIS_MODULE,
+};
+
+static int __init plug_module_init(void)
+{
+ return register_qdisc(&plug_qdisc_ops);
+}
+
+static void __exit plug_module_exit(void)
+{
+ unregister_qdisc(&plug_qdisc_ops);
+}
+module_init(plug_module_init)
+module_exit(plug_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
new file mode 100644
index 000000000..0e6f34bd9
--- /dev/null
+++ b/net/sched/sch_prio.c
@@ -0,0 +1,484 @@
+/*
+ * net/sched/sch_prio.c Simple 3-band priority "scheduler".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
+ * Init -- EINVAL when opt undefined
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+struct prio_sched_data {
+ int bands;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ u8 prio2band[TC_PRIO_MAX+1];
+ struct Qdisc *queues[TCQ_PRIO_BANDS];
+};
+
+
+static struct Qdisc *
+prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ u32 band = skb->priority;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
+ fl = rcu_dereference_bh(q->filter_list);
+ err = tcf_classify(skb, fl, &res, false);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ if (!fl || err < 0) {
+ if (TC_H_MAJ(band))
+ band = 0;
+ return q->queues[q->prio2band[band & TC_PRIO_MAX]];
+ }
+ band = res.classid;
+ }
+ band = TC_H_MIN(band) - 1;
+ if (band >= q->bands)
+ return q->queues[q->prio2band[0]];
+
+ return q->queues[band];
+}
+
+static int
+prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct Qdisc *qdisc;
+ int ret;
+
+ qdisc = prio_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+ if (qdisc == NULL) {
+
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+#endif
+
+ ret = qdisc_enqueue(skb, qdisc, to_free);
+ if (ret == NET_XMIT_SUCCESS) {
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ return ret;
+}
+
+static struct sk_buff *prio_peek(struct Qdisc *sch)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ struct Qdisc *qdisc = q->queues[prio];
+ struct sk_buff *skb = qdisc->ops->peek(qdisc);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static struct sk_buff *prio_dequeue(struct Qdisc *sch)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ struct Qdisc *qdisc = q->queues[prio];
+ struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static void
+prio_reset(struct Qdisc *sch)
+{
+ int prio;
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ for (prio = 0; prio < q->bands; prio++)
+ qdisc_reset(q->queues[prio]);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_prio_qopt_offload opt = {
+ .handle = sch->handle,
+ .parent = sch->parent,
+ };
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ if (qopt) {
+ opt.command = TC_PRIO_REPLACE;
+ opt.replace_params.bands = qopt->bands;
+ memcpy(&opt.replace_params.priomap, qopt->priomap,
+ TC_PRIO_MAX + 1);
+ opt.replace_params.qstats = &sch->qstats;
+ } else {
+ opt.command = TC_PRIO_DESTROY;
+ }
+
+ return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, &opt);
+}
+
+static void
+prio_destroy(struct Qdisc *sch)
+{
+ int prio;
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ prio_offload(sch, NULL);
+ for (prio = 0; prio < q->bands; prio++)
+ qdisc_put(q->queues[prio]);
+}
+
+static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *queues[TCQ_PRIO_BANDS];
+ int oldbands = q->bands, i;
+ struct tc_prio_qopt *qopt;
+
+ if (nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+ qopt = nla_data(opt);
+
+ if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+ return -EINVAL;
+
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ if (qopt->priomap[i] >= qopt->bands)
+ return -EINVAL;
+ }
+
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < qopt->bands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, i + 1),
+ extack);
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_put(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
+ prio_offload(sch, qopt);
+ sch_tree_lock(sch);
+ q->bands = qopt->bands;
+ memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
+
+ for (i = q->bands; i < oldbands; i++) {
+ struct Qdisc *child = q->queues[i];
+
+ qdisc_tree_reduce_backlog(child, child->q.qlen,
+ child->qstats.backlog);
+ qdisc_put(child);
+ }
+
+ for (i = oldbands; i < q->bands; i++) {
+ q->queues[i] = queues[i];
+ if (q->queues[i] != &noop_qdisc)
+ qdisc_hash_add(q->queues[i], true);
+ }
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int prio_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ return prio_tune(sch, opt, extack);
+}
+
+static int prio_dump_offload(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_prio_qopt_offload hw_stats = {
+ .command = TC_PRIO_STATS,
+ .handle = sch->handle,
+ .parent = sch->parent,
+ {
+ .stats = {
+ .bstats = &sch->bstats,
+ .qstats = &sch->qstats,
+ },
+ },
+ };
+ int err;
+
+ sch->flags &= ~TCQ_F_OFFLOADED;
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return 0;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
+ &hw_stats);
+ if (err == -EOPNOTSUPP)
+ return 0;
+
+ if (!err)
+ sch->flags |= TCQ_F_OFFLOADED;
+
+ return err;
+}
+
+static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_prio_qopt opt;
+ int err;
+
+ opt.bands = q->bands;
+ memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
+
+ err = prio_dump_offload(sch);
+ if (err)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct tc_prio_qopt_offload graft_offload;
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long band = arg - 1;
+ bool any_qdisc_is_offloaded;
+ int err;
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, arg), extack);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
+
+ *old = qdisc_replace(sch, new, &q->queues[band]);
+
+ if (!tc_can_offload(dev))
+ return 0;
+
+ graft_offload.handle = sch->handle;
+ graft_offload.parent = sch->parent;
+ graft_offload.graft_params.band = band;
+ graft_offload.graft_params.child_handle = new->handle;
+ graft_offload.command = TC_PRIO_GRAFT;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
+ &graft_offload);
+
+ /* Don't report error if the graft is part of destroy operation. */
+ if (err && new != &noop_qdisc) {
+ /* Don't report error if the parent, the old child and the new
+ * one are not offloaded.
+ */
+ any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
+ any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED;
+ if (*old)
+ any_qdisc_is_offloaded |= (*old)->flags &
+ TCQ_F_OFFLOADED;
+
+ if (any_qdisc_is_offloaded)
+ NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
+ }
+
+ return 0;
+}
+
+static struct Qdisc *
+prio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ return q->queues[band];
+}
+
+static unsigned long prio_find(struct Qdisc *sch, u32 classid)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ unsigned long band = TC_H_MIN(classid);
+
+ if (band - 1 >= q->bands)
+ return 0;
+ return band;
+}
+
+static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+ return prio_find(sch, classid);
+}
+
+
+static void prio_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
+ struct tcmsg *tcm)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ tcm->tcm_info = q->queues[cl-1]->handle;
+ return 0;
+}
+
+static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *cl_q;
+
+ cl_q = q->queues[cl - 1];
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ if (arg->stop)
+ return;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, prio + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->block;
+}
+
+static const struct Qdisc_class_ops prio_class_ops = {
+ .graft = prio_graft,
+ .leaf = prio_leaf,
+ .find = prio_find,
+ .walk = prio_walk,
+ .tcf_block = prio_tcf_block,
+ .bind_tcf = prio_bind,
+ .unbind_tcf = prio_unbind,
+ .dump = prio_dump_class,
+ .dump_stats = prio_dump_class_stats,
+};
+
+static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &prio_class_ops,
+ .id = "prio",
+ .priv_size = sizeof(struct prio_sched_data),
+ .enqueue = prio_enqueue,
+ .dequeue = prio_dequeue,
+ .peek = prio_peek,
+ .init = prio_init,
+ .reset = prio_reset,
+ .destroy = prio_destroy,
+ .change = prio_tune,
+ .dump = prio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init prio_module_init(void)
+{
+ return register_qdisc(&prio_qdisc_ops);
+}
+
+static void __exit prio_module_exit(void)
+{
+ unregister_qdisc(&prio_qdisc_ops);
+}
+
+module_init(prio_module_init)
+module_exit(prio_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
new file mode 100644
index 000000000..20dc1851d
--- /dev/null
+++ b/net/sched/sch_qfq.c
@@ -0,0 +1,1532 @@
+/*
+ * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
+ *
+ * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
+ * Copyright (c) 2012 Paolo Valente.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+
+/* Quick Fair Queueing Plus
+ ========================
+
+ Sources:
+
+ [1] Paolo Valente,
+ "Reducing the Execution Time of Fair-Queueing Schedulers."
+ http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
+
+ Sources for QFQ:
+
+ [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
+ Packet Scheduling with Tight Bandwidth Distribution Guarantees."
+
+ See also:
+ http://retis.sssup.it/~fabio/linux/qfq/
+ */
+
+/*
+
+ QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
+ classes. Each aggregate is timestamped with a virtual start time S
+ and a virtual finish time F, and scheduled according to its
+ timestamps. S and F are computed as a function of a system virtual
+ time function V. The classes within each aggregate are instead
+ scheduled with DRR.
+
+ To speed up operations, QFQ+ divides also aggregates into a limited
+ number of groups. Which group a class belongs to depends on the
+ ratio between the maximum packet length for the class and the weight
+ of the class. Groups have their own S and F. In the end, QFQ+
+ schedules groups, then aggregates within groups, then classes within
+ aggregates. See [1] and [2] for a full description.
+
+ Virtual time computations.
+
+ S, F and V are all computed in fixed point arithmetic with
+ FRAC_BITS decimal bits.
+
+ QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+ one bit per index.
+ QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+
+ The layout of the bits is as below:
+
+ [ MTU_SHIFT ][ FRAC_BITS ]
+ [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
+ ^.__grp->index = 0
+ *.__grp->slot_shift
+
+ where MIN_SLOT_SHIFT is derived by difference from the others.
+
+ The max group index corresponds to Lmax/w_min, where
+ Lmax=1<<MTU_SHIFT, w_min = 1 .
+ From this, and knowing how many groups (MAX_INDEX) we want,
+ we can derive the shift corresponding to each group.
+
+ Because we often need to compute
+ F = S + len/w_i and V = V + len/wsum
+ instead of storing w_i store the value
+ inv_w = (1<<FRAC_BITS)/w_i
+ so we can do F = S + len * inv_w * wsum.
+ We use W_TOT in the formulas so we can easily move between
+ static and adaptive weight sum.
+
+ The per-scheduler-instance data contain all the data structures
+ for the scheduler: bitmaps and bucket lists.
+
+ */
+
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group.
+ */
+#define QFQ_MAX_SLOTS 32
+
+/*
+ * Shifts used for aggregate<->group mapping. We allow class weights that are
+ * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
+ * group with the smallest index that can support the L_i / r_i configured
+ * for the classes in the aggregate.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ */
+#define QFQ_MAX_INDEX 24
+#define QFQ_MAX_WSHIFT 10
+
+#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
+#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT)
+
+#define FRAC_BITS 30 /* fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
+#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
+
+#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
+
+/*
+ * Possible group states. These values are used as indexes for the bitmaps
+ * array of struct qfq_queue.
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+
+struct qfq_aggregate;
+
+struct qfq_class {
+ struct Qdisc_class_common common;
+
+ unsigned int filter_cnt;
+
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct net_rate_estimator __rcu *rate_est;
+ struct Qdisc *qdisc;
+ struct list_head alist; /* Link for active-classes list. */
+ struct qfq_aggregate *agg; /* Parent aggregate. */
+ int deficit; /* DRR deficit counter. */
+};
+
+struct qfq_aggregate {
+ struct hlist_node next; /* Link for the slot list. */
+ u64 S, F; /* flow timestamps (exact) */
+
+ /* group we belong to. In principle we would need the index,
+ * which is log_2(lmax/weight), but we never reference it
+ * directly, only the group.
+ */
+ struct qfq_group *grp;
+
+ /* these are copied from the flowset. */
+ u32 class_weight; /* Weight of each class in this aggregate. */
+ /* Max pkt size for the classes in this aggregate, DRR quantum. */
+ int lmax;
+
+ u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */
+ u32 budgetmax; /* Max budget for this aggregate. */
+ u32 initial_budget, budget; /* Initial and current budget. */
+
+ int num_classes; /* Number of classes in this aggr. */
+ struct list_head active; /* DRR queue of active classes. */
+
+ struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */
+};
+
+struct qfq_group {
+ u64 S, F; /* group timestamps (approx). */
+ unsigned int slot_shift; /* Slot shift. */
+ unsigned int index; /* Group index. */
+ unsigned int front; /* Index of the front slot. */
+ unsigned long full_slots; /* non-empty slots */
+
+ /* Array of RR lists of active aggregates. */
+ struct hlist_head slots[QFQ_MAX_SLOTS];
+};
+
+struct qfq_sched {
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ struct Qdisc_class_hash clhash;
+
+ u64 oldV, V; /* Precise virtual times. */
+ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
+ u32 wsum; /* weight sum */
+ u32 iwsum; /* inverse weight sum */
+
+ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
+ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+ u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */
+
+ u32 max_agg_classes; /* Max number of classes per aggr. */
+ struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
+};
+
+/*
+ * Possible reasons why the timestamps of an aggregate are updated
+ * enqueue: the aggregate switches from idle to active and must scheduled
+ * for service
+ * requeue: the aggregate finishes its budget, so it stops being served and
+ * must be rescheduled for service
+ */
+enum update_reason {enqueue, requeue};
+
+static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct qfq_class, common);
+}
+
+static void qfq_purge_queue(struct qfq_class *cl)
+{
+ unsigned int len = cl->qdisc->q.qlen;
+ unsigned int backlog = cl->qdisc->qstats.backlog;
+
+ qdisc_reset(cl->qdisc);
+ qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
+}
+
+static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
+ [TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
+ [TCA_QFQ_LMAX] = { .type = NLA_U32 },
+};
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
+{
+ u64 slot_size = (u64)maxlen * inv_w;
+ unsigned long size_map;
+ int index = 0;
+
+ size_map = slot_size >> min_slot_shift;
+ if (!size_map)
+ goto out;
+
+ index = __fls(size_map) + 1; /* basically a log_2 */
+ index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
+
+ if (index < 0)
+ index = 0;
+out:
+ pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
+ (unsigned long) ONE_FP/inv_w, maxlen, index);
+
+ return index;
+}
+
+static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
+static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
+ enum update_reason);
+
+static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ u32 lmax, u32 weight)
+{
+ INIT_LIST_HEAD(&agg->active);
+ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+ agg->lmax = lmax;
+ agg->class_weight = weight;
+}
+
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+ u32 lmax, u32 weight)
+{
+ struct qfq_aggregate *agg;
+
+ hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
+ if (agg->lmax == lmax && agg->class_weight == weight)
+ return agg;
+
+ return NULL;
+}
+
+
+/* Update aggregate as a function of the new number of classes. */
+static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ int new_num_classes)
+{
+ u32 new_agg_weight;
+
+ if (new_num_classes == q->max_agg_classes)
+ hlist_del_init(&agg->nonfull_next);
+
+ if (agg->num_classes > new_num_classes &&
+ new_num_classes == q->max_agg_classes - 1) /* agg no more full */
+ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+ /* The next assignment may let
+ * agg->initial_budget > agg->budgetmax
+ * hold, we will take it into account in charge_actual_service().
+ */
+ agg->budgetmax = new_num_classes * agg->lmax;
+ new_agg_weight = agg->class_weight * new_num_classes;
+ agg->inv_w = ONE_FP/new_agg_weight;
+
+ if (agg->grp == NULL) {
+ int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
+ q->min_slot_shift);
+ agg->grp = &q->groups[i];
+ }
+
+ q->wsum +=
+ (int) agg->class_weight * (new_num_classes - agg->num_classes);
+ q->iwsum = ONE_FP / q->wsum;
+
+ agg->num_classes = new_num_classes;
+}
+
+/* Add class to aggregate. */
+static void qfq_add_to_agg(struct qfq_sched *q,
+ struct qfq_aggregate *agg,
+ struct qfq_class *cl)
+{
+ cl->agg = agg;
+
+ qfq_update_agg(q, agg, agg->num_classes+1);
+ if (cl->qdisc->q.qlen > 0) { /* adding an active class */
+ list_add_tail(&cl->alist, &agg->active);
+ if (list_first_entry(&agg->active, struct qfq_class, alist) ==
+ cl && q->in_serv_agg != agg) /* agg was inactive */
+ qfq_activate_agg(q, agg, enqueue); /* schedule agg */
+ }
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
+
+static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ hlist_del_init(&agg->nonfull_next);
+ q->wsum -= agg->class_weight;
+ if (q->wsum != 0)
+ q->iwsum = ONE_FP / q->wsum;
+
+ if (q->in_serv_agg == agg)
+ q->in_serv_agg = qfq_choose_next_agg(q);
+ kfree(agg);
+}
+
+/* Deschedule class from within its parent aggregate. */
+static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+{
+ struct qfq_aggregate *agg = cl->agg;
+
+
+ list_del(&cl->alist); /* remove from RR queue of the aggregate */
+ if (list_empty(&agg->active)) /* agg is now inactive */
+ qfq_deactivate_agg(q, agg);
+}
+
+/* Remove class from its parent aggregate. */
+static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+ struct qfq_aggregate *agg = cl->agg;
+
+ cl->agg = NULL;
+ if (agg->num_classes == 1) { /* agg being emptied, destroy it */
+ qfq_destroy_agg(q, agg);
+ return;
+ }
+ qfq_update_agg(q, agg, agg->num_classes-1);
+}
+
+/* Deschedule class and remove it from its parent aggregate. */
+static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+ if (cl->qdisc->q.qlen > 0) /* class is active */
+ qfq_deactivate_class(q, cl);
+
+ qfq_rm_from_agg(q, cl);
+}
+
+/* Move class to a new aggregate, matching the new class weight and/or lmax */
+static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
+ u32 lmax)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
+
+ if (new_agg == NULL) { /* create new aggregate */
+ new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
+ if (new_agg == NULL)
+ return -ENOBUFS;
+ qfq_init_agg(q, new_agg, lmax, weight);
+ }
+ qfq_deact_rm_from_agg(q, cl);
+ qfq_add_to_agg(q, new_agg, cl);
+
+ return 0;
+}
+
+static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)*arg;
+ bool existing = false;
+ struct nlattr *tb[TCA_QFQ_MAX + 1];
+ struct qfq_aggregate *new_agg = NULL;
+ u32 weight, lmax, inv_w;
+ int err;
+ int delta_w;
+
+ if (tca[TCA_OPTIONS] == NULL) {
+ pr_notice("qfq: no options\n");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy,
+ NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_QFQ_WEIGHT]) {
+ weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
+ if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
+ pr_notice("qfq: invalid weight %u\n", weight);
+ return -EINVAL;
+ }
+ } else
+ weight = 1;
+
+ if (tb[TCA_QFQ_LMAX]) {
+ lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
+ if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
+ pr_notice("qfq: invalid max length %u\n", lmax);
+ return -EINVAL;
+ }
+ } else
+ lmax = psched_mtu(qdisc_dev(sch));
+
+ inv_w = ONE_FP / weight;
+ weight = ONE_FP / inv_w;
+
+ if (cl != NULL &&
+ lmax == cl->agg->lmax &&
+ weight == cl->agg->class_weight)
+ return 0; /* nothing to change */
+
+ delta_w = weight - (cl ? cl->agg->class_weight : 0);
+
+ if (q->wsum + delta_w > QFQ_MAX_WSUM) {
+ pr_notice("qfq: total weight out of range (%d + %u)\n",
+ delta_w, q->wsum);
+ return -EINVAL;
+ }
+
+ if (cl != NULL) { /* modify existing class */
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+ existing = true;
+ goto set_change_agg;
+ }
+
+ /* create and init new class */
+ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
+ if (cl == NULL)
+ return -ENOBUFS;
+
+ cl->common.classid = classid;
+ cl->deficit = lmax;
+
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ classid, NULL);
+ if (cl->qdisc == NULL)
+ cl->qdisc = &noop_qdisc;
+
+ if (tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ NULL,
+ qdisc_root_sleeping_running(sch),
+ tca[TCA_RATE]);
+ if (err)
+ goto destroy_class;
+ }
+
+ if (cl->qdisc != &noop_qdisc)
+ qdisc_hash_add(cl->qdisc, true);
+
+set_change_agg:
+ sch_tree_lock(sch);
+ new_agg = qfq_find_agg(q, lmax, weight);
+ if (new_agg == NULL) { /* create new aggregate */
+ sch_tree_unlock(sch);
+ new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
+ if (new_agg == NULL) {
+ err = -ENOBUFS;
+ gen_kill_estimator(&cl->rate_est);
+ goto destroy_class;
+ }
+ sch_tree_lock(sch);
+ qfq_init_agg(q, new_agg, lmax, weight);
+ }
+ if (existing)
+ qfq_deact_rm_from_agg(q, cl);
+ else
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
+ qfq_add_to_agg(q, new_agg, cl);
+ sch_tree_unlock(sch);
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+
+destroy_class:
+ qdisc_put(cl->qdisc);
+ kfree(cl);
+ return err;
+}
+
+static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+
+ qfq_rm_from_agg(q, cl);
+ gen_kill_estimator(&cl->rate_est);
+ qdisc_put(cl->qdisc);
+ kfree(cl);
+}
+
+static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (cl->filter_cnt > 0)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ qfq_purge_queue(cl);
+ qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+ sch_tree_unlock(sch);
+
+ qfq_destroy_class(sch, cl);
+ return 0;
+}
+
+static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
+{
+ return (unsigned long)qfq_find_class(sch, classid);
+}
+
+static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+
+ return q->block;
+}
+
+static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct qfq_class *cl = qfq_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->filter_cnt++;
+
+ return (unsigned long)cl;
+}
+
+static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ cl->filter_cnt--;
+}
+
+static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->common.classid, NULL);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ return 0;
+}
+
+static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ return cl->qdisc;
+}
+
+static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
+ nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+ struct tc_qfq_stats xstats;
+
+ memset(&xstats, 0, sizeof(xstats));
+
+ xstats.weight = cl->agg->class_weight;
+ xstats.lmax = cl->agg->lmax;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL,
+ &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+ pr_debug("qfq_classify: found %d\n", skb->priority);
+ cl = qfq_find_class(sch, skb->priority);
+ if (cl != NULL)
+ return cl;
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tcf_classify(skb, fl, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (struct qfq_class *)res.class;
+ if (cl == NULL)
+ cl = qfq_find_class(sch, res.classid);
+ return cl;
+ }
+
+ return NULL;
+}
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(u64 a, u64 b)
+{
+ return (s64)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline u64 qfq_round_down(u64 ts, unsigned int shift)
+{
+ return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+ unsigned long bitmap)
+{
+ int index = __ffs(bitmap);
+ return &q->groups[index];
+}
+/* Calculate a mask to mimic what would be ffs_from(). */
+static inline unsigned long mask_from(unsigned long bitmap, int from)
+{
+ return bitmap & ~((1UL << from) - 1);
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
+{
+ /* if S > V we are not eligible */
+ unsigned int state = qfq_gt(grp->S, q->V);
+ unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (qfq_gt(grp->F, next->F))
+ state |= EB;
+ }
+
+ return state;
+}
+
+
+/*
+ * In principle
+ * q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ * q->bitmaps[src] &= ~mask;
+ * but we should make sure that src != dst
+ */
+static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
+ int src, int dst)
+{
+ q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ q->bitmaps[src] &= ~mask;
+}
+
+static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
+{
+ unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (!qfq_gt(next->F, old_F))
+ return;
+ }
+
+ mask = (1UL << index) - 1;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+ old_V ^= q->V;
+ old_V >>= q->min_slot_shift;
+ if (old_V) {
+ ...
+ }
+ *
+ */
+static void qfq_make_eligible(struct qfq_sched *q)
+{
+ unsigned long vslot = q->V >> q->min_slot_shift;
+ unsigned long old_vslot = q->oldV >> q->min_slot_shift;
+
+ if (vslot != old_vslot) {
+ unsigned long mask;
+ int last_flip_pos = fls(vslot ^ old_vslot);
+
+ if (last_flip_pos > 31) /* higher than the number of groups */
+ mask = ~0UL; /* make all groups eligible */
+ else
+ mask = (1UL << last_flip_pos) - 1;
+
+ qfq_move_groups(q, mask, IR, ER);
+ qfq_move_groups(q, mask, IB, EB);
+ }
+}
+
+/*
+ * The index of the slot in which the input aggregate agg is to be
+ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
+ * and not a '-1' because the start time of the group may be moved
+ * backward by one slot after the aggregate has been inserted, and
+ * this would cause non-empty slots to be right-shifted by one
+ * position.
+ *
+ * QFQ+ fully satisfies this bound to the slot index if the parameters
+ * of the classes are not changed dynamically, and if QFQ+ never
+ * happens to postpone the service of agg unjustly, i.e., it never
+ * happens that the aggregate becomes backlogged and eligible, or just
+ * eligible, while an aggregate with a higher approximated finish time
+ * is being served. In particular, in this case QFQ+ guarantees that
+ * the timestamps of agg are low enough that the slot index is never
+ * higher than 2. Unfortunately, QFQ+ cannot provide the same
+ * guarantee if it happens to unjustly postpone the service of agg, or
+ * if the parameters of some class are changed.
+ *
+ * As for the first event, i.e., an out-of-order service, the
+ * upper bound to the slot index guaranteed by QFQ+ grows to
+ * 2 +
+ * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
+ *
+ * The following function deals with this problem by backward-shifting
+ * the timestamps of agg, if needed, so as to guarantee that the slot
+ * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
+ * cause the service of other aggregates to be postponed, yet the
+ * worst-case guarantees of these aggregates are not violated. In
+ * fact, in case of no out-of-order service, the timestamps of agg
+ * would have been even lower than they are after the backward shift,
+ * because QFQ+ would have guaranteed a maximum value equal to 2 for
+ * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
+ * service is postponed because of the backward-shift would have
+ * however waited for the service of agg before being served.
+ *
+ * The other event that may cause the slot index to be higher than 2
+ * for agg is a recent change of the parameters of some class. If the
+ * weight of a class is increased or the lmax (max_pkt_size) of the
+ * class is decreased, then a new aggregate with smaller slot size
+ * than the original parent aggregate of the class may happen to be
+ * activated. The activation of this aggregate should be properly
+ * delayed to when the service of the class has finished in the ideal
+ * system tracked by QFQ+. If the activation of the aggregate is not
+ * delayed to this reference time instant, then this aggregate may be
+ * unjustly served before other aggregates waiting for service. This
+ * may cause the above bound to the slot index to be violated for some
+ * of these unlucky aggregates.
+ *
+ * Instead of delaying the activation of the new aggregate, which is
+ * quite complex, the above-discussed capping of the slot index is
+ * used to handle also the consequences of a change of the parameters
+ * of a class.
+ */
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
+ u64 roundedS)
+{
+ u64 slot = (roundedS - grp->S) >> grp->slot_shift;
+ unsigned int i; /* slot index in the bucket list */
+
+ if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
+ u64 deltaS = roundedS - grp->S -
+ ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
+ agg->S -= deltaS;
+ agg->F -= deltaS;
+ slot = QFQ_MAX_SLOTS - 2;
+ }
+
+ i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+ hlist_add_head(&agg->next, &grp->slots[i]);
+ __set_bit(slot, &grp->full_slots);
+}
+
+/* Maybe introduce hlist_first_entry?? */
+static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
+{
+ return hlist_entry(grp->slots[grp->front].first,
+ struct qfq_aggregate, next);
+}
+
+/*
+ * remove the entry from the slot
+ */
+static void qfq_front_slot_remove(struct qfq_group *grp)
+{
+ struct qfq_aggregate *agg = qfq_slot_head(grp);
+
+ BUG_ON(!agg);
+ hlist_del(&agg->next);
+ if (hlist_empty(&grp->slots[grp->front]))
+ __clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first aggregate in the first non-empty bucket of the
+ * group. As a side effect, adjusts the bucket list so the first
+ * non-empty bucket is at position 0 in full_slots.
+ */
+static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
+{
+ unsigned int i;
+
+ pr_debug("qfq slot_scan: grp %u full %#lx\n",
+ grp->index, grp->full_slots);
+
+ if (grp->full_slots == 0)
+ return NULL;
+
+ i = __ffs(grp->full_slots); /* zero based */
+ if (i > 0) {
+ grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+ grp->full_slots >>= i;
+ }
+
+ return qfq_slot_head(grp);
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
+{
+ unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+ grp->full_slots <<= i;
+ grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+static void qfq_update_eligible(struct qfq_sched *q)
+{
+ struct qfq_group *grp;
+ unsigned long ineligible;
+
+ ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+ if (ineligible) {
+ if (!q->bitmaps[ER]) {
+ grp = qfq_ffs(q, ineligible);
+ if (qfq_gt(grp->S, q->V))
+ q->V = grp->S;
+ }
+ qfq_make_eligible(q);
+ }
+}
+
+/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
+static void agg_dequeue(struct qfq_aggregate *agg,
+ struct qfq_class *cl, unsigned int len)
+{
+ qdisc_dequeue_peeked(cl->qdisc);
+
+ cl->deficit -= (int) len;
+
+ if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
+ list_del(&cl->alist);
+ else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
+ cl->deficit += agg->lmax;
+ list_move_tail(&cl->alist, &agg->active);
+ }
+}
+
+static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
+ struct qfq_class **cl,
+ unsigned int *len)
+{
+ struct sk_buff *skb;
+
+ *cl = list_first_entry(&agg->active, struct qfq_class, alist);
+ skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
+ if (skb == NULL)
+ WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
+ else
+ *len = qdisc_pkt_len(skb);
+
+ return skb;
+}
+
+/* Update F according to the actual service received by the aggregate. */
+static inline void charge_actual_service(struct qfq_aggregate *agg)
+{
+ /* Compute the service received by the aggregate, taking into
+ * account that, after decreasing the number of classes in
+ * agg, it may happen that
+ * agg->initial_budget - agg->budget > agg->bugdetmax
+ */
+ u32 service_received = min(agg->budgetmax,
+ agg->initial_budget - agg->budget);
+
+ agg->F = agg->S + (u64)service_received * agg->inv_w;
+}
+
+/* Assign a reasonable start time for a new aggregate in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in EB (see [2]). So, if we have groups in ER,
+ * set S to the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ unsigned long mask;
+ u64 limit, roundedF;
+ int slot_shift = agg->grp->slot_shift;
+
+ roundedF = qfq_round_down(agg->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+ if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], agg->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ if (qfq_gt(limit, next->F))
+ agg->S = next->F;
+ else /* preserve timestamp correctness */
+ agg->S = limit;
+ return;
+ }
+ }
+ agg->S = q->V;
+ } else /* timestamp is not stale */
+ agg->S = agg->F;
+}
+
+/* Update the timestamps of agg before scheduling/rescheduling it for
+ * service. In particular, assign to agg->F its maximum possible
+ * value, i.e., the virtual finish time with which the aggregate
+ * should be labeled if it used all its budget once in service.
+ */
+static inline void
+qfq_update_agg_ts(struct qfq_sched *q,
+ struct qfq_aggregate *agg, enum update_reason reason)
+{
+ if (reason != requeue)
+ qfq_update_start(q, agg);
+ else /* just charge agg for the service received */
+ agg->S = agg->F;
+
+ agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
+}
+
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
+
+static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
+ struct qfq_class *cl;
+ struct sk_buff *skb = NULL;
+ /* next-packet len, 0 means no more active classes in in-service agg */
+ unsigned int len = 0;
+
+ if (in_serv_agg == NULL)
+ return NULL;
+
+ if (!list_empty(&in_serv_agg->active))
+ skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+
+ /*
+ * If there are no active classes in the in-service aggregate,
+ * or if the aggregate has not enough budget to serve its next
+ * class, then choose the next aggregate to serve.
+ */
+ if (len == 0 || in_serv_agg->budget < len) {
+ charge_actual_service(in_serv_agg);
+
+ /* recharge the budget of the aggregate */
+ in_serv_agg->initial_budget = in_serv_agg->budget =
+ in_serv_agg->budgetmax;
+
+ if (!list_empty(&in_serv_agg->active)) {
+ /*
+ * Still active: reschedule for
+ * service. Possible optimization: if no other
+ * aggregate is active, then there is no point
+ * in rescheduling this aggregate, and we can
+ * just keep it as the in-service one. This
+ * should be however a corner case, and to
+ * handle it, we would need to maintain an
+ * extra num_active_aggs field.
+ */
+ qfq_update_agg_ts(q, in_serv_agg, requeue);
+ qfq_schedule_agg(q, in_serv_agg);
+ } else if (sch->q.qlen == 0) { /* no aggregate to serve */
+ q->in_serv_agg = NULL;
+ return NULL;
+ }
+
+ /*
+ * If we get here, there are other aggregates queued:
+ * choose the new aggregate to serve.
+ */
+ in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
+ skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+ }
+ if (!skb)
+ return NULL;
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+
+ agg_dequeue(in_serv_agg, cl, len);
+ /* If lmax is lowered, through qfq_change_class, for a class
+ * owning pending packets with larger size than the new value
+ * of lmax, then the following condition may hold.
+ */
+ if (unlikely(in_serv_agg->budget < len))
+ in_serv_agg->budget = 0;
+ else
+ in_serv_agg->budget -= len;
+
+ q->V += (u64)len * q->iwsum;
+ pr_debug("qfq dequeue: len %u F %lld now %lld\n",
+ len, (unsigned long long) in_serv_agg->F,
+ (unsigned long long) q->V);
+
+ return skb;
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
+{
+ struct qfq_group *grp;
+ struct qfq_aggregate *agg, *new_front_agg;
+ u64 old_F;
+
+ qfq_update_eligible(q);
+ q->oldV = q->V;
+
+ if (!q->bitmaps[ER])
+ return NULL;
+
+ grp = qfq_ffs(q, q->bitmaps[ER]);
+ old_F = grp->F;
+
+ agg = qfq_slot_head(grp);
+
+ /* agg starts to be served, remove it from schedule */
+ qfq_front_slot_remove(grp);
+
+ new_front_agg = qfq_slot_scan(grp);
+
+ if (new_front_agg == NULL) /* group is now inactive, remove from ER */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ else {
+ u64 roundedS = qfq_round_down(new_front_agg->S,
+ grp->slot_shift);
+ unsigned int s;
+
+ if (grp->S == roundedS)
+ return agg;
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+
+ qfq_unblock_groups(q, grp->index, old_F);
+
+ return agg;
+}
+
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct qfq_aggregate *agg;
+ int err = 0;
+
+ cl = qfq_classify(skb, sch, &err);
+ if (cl == NULL) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+ pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
+
+ if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
+ pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
+ cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
+ err = qfq_change_agg(sch, cl, cl->agg->class_weight,
+ qdisc_pkt_len(skb));
+ if (err) {
+ cl->qstats.drops++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ pr_debug("qfq_enqueue: enqueue failed %d\n", err);
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ bstats_update(&cl->bstats, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+ ++sch->q.qlen;
+
+ agg = cl->agg;
+ /* if the queue was not empty, then done here */
+ if (cl->qdisc->q.qlen != 1) {
+ if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
+ list_first_entry(&agg->active, struct qfq_class, alist)
+ == cl && cl->deficit < qdisc_pkt_len(skb))
+ list_move_tail(&cl->alist, &agg->active);
+
+ return err;
+ }
+
+ /* schedule class for service within the aggregate */
+ cl->deficit = agg->lmax;
+ list_add_tail(&cl->alist, &agg->active);
+
+ if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
+ q->in_serv_agg == agg)
+ return err; /* non-empty or in service, nothing else to do */
+
+ qfq_activate_agg(q, agg, enqueue);
+
+ return err;
+}
+
+/*
+ * Schedule aggregate according to its timestamps.
+ */
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ struct qfq_group *grp = agg->grp;
+ u64 roundedS;
+ int s;
+
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+
+ /*
+ * Insert agg in the correct bucket.
+ * If agg->S >= grp->S we don't need to adjust the
+ * bucket list and simply go to the insertion phase.
+ * Otherwise grp->S is decreasing, we must make room
+ * in the bucket list, and also recompute the group state.
+ * Finally, if there were no flows in this group and nobody
+ * was in ER make sure to adjust V.
+ */
+ if (grp->full_slots) {
+ if (!qfq_gt(grp->S, agg->S))
+ goto skip_update;
+
+ /* create a slot for this agg->S */
+ qfq_slot_rotate(grp, roundedS);
+ /* group was surely ineligible, remove */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
+ q->in_serv_agg == NULL)
+ q->V = roundedS;
+
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+
+ pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
+ s, q->bitmaps[s],
+ (unsigned long long) agg->S,
+ (unsigned long long) agg->F,
+ (unsigned long long) q->V);
+
+skip_update:
+ qfq_slot_insert(grp, agg, roundedS);
+}
+
+
+/* Update agg ts and schedule agg for service */
+static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ enum update_reason reason)
+{
+ agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
+
+ qfq_update_agg_ts(q, agg, reason);
+ if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
+ q->in_serv_agg = agg; /* start serving this aggregate */
+ /* update V: to be in service, agg must be eligible */
+ q->oldV = q->V = agg->S;
+ } else if (agg != q->in_serv_agg)
+ qfq_schedule_agg(q, agg);
+}
+
+static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_aggregate *agg)
+{
+ unsigned int i, offset;
+ u64 roundedS;
+
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+ offset = (roundedS - grp->S) >> grp->slot_shift;
+
+ i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+ hlist_del(&agg->next);
+ if (hlist_empty(&grp->slots[i]))
+ __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * Called to forcibly deschedule an aggregate. If the aggregate is
+ * not in the front bucket, or if the latter has other aggregates in
+ * the front bucket, we can simply remove the aggregate with no other
+ * side effects.
+ * Otherwise we must propagate the event up.
+ */
+static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ struct qfq_group *grp = agg->grp;
+ unsigned long mask;
+ u64 roundedS;
+ int s;
+
+ if (agg == q->in_serv_agg) {
+ charge_actual_service(agg);
+ q->in_serv_agg = qfq_choose_next_agg(q);
+ return;
+ }
+
+ agg->F = agg->S;
+ qfq_slot_remove(q, grp, agg);
+
+ if (!grp->full_slots) {
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+
+ if (test_bit(grp->index, &q->bitmaps[ER]) &&
+ !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+ mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+ if (mask)
+ mask = ~((1UL << __fls(mask)) - 1);
+ else
+ mask = ~0UL;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+ }
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ } else if (hlist_empty(&grp->slots[grp->front])) {
+ agg = qfq_slot_scan(grp);
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+ if (grp->S != roundedS) {
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ }
+}
+
+static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ qfq_deactivate_class(q, cl);
+}
+
+static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_group *grp;
+ int i, j, err;
+ u32 max_cl_shift, maxbudg_shift, max_classes;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+
+ max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1,
+ QFQ_MAX_AGG_CLASSES);
+ /* max_cl_shift = floor(log_2(max_classes)) */
+ max_cl_shift = __fls(max_classes);
+ q->max_agg_classes = 1<<max_cl_shift;
+
+ /* maxbudg_shift = log2(max_len * max_classes_per_agg) */
+ maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
+ q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ grp->index = i;
+ grp->slot_shift = q->min_slot_shift + i;
+ for (j = 0; j < QFQ_MAX_SLOTS; j++)
+ INIT_HLIST_HEAD(&grp->slots[j]);
+ }
+
+ INIT_HLIST_HEAD(&q->nonfull_aggs);
+
+ return 0;
+}
+
+static void qfq_reset_qdisc(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (cl->qdisc->q.qlen > 0)
+ qfq_deactivate_class(q, cl);
+
+ qdisc_reset(cl->qdisc);
+ }
+ }
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static void qfq_destroy_qdisc(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct hlist_node *next;
+ unsigned int i;
+
+ tcf_block_put(q->block);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode) {
+ qfq_destroy_class(sch, cl);
+ }
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+}
+
+static const struct Qdisc_class_ops qfq_class_ops = {
+ .change = qfq_change_class,
+ .delete = qfq_delete_class,
+ .find = qfq_search_class,
+ .tcf_block = qfq_tcf_block,
+ .bind_tcf = qfq_bind_tcf,
+ .unbind_tcf = qfq_unbind_tcf,
+ .graft = qfq_graft_class,
+ .leaf = qfq_class_leaf,
+ .qlen_notify = qfq_qlen_notify,
+ .dump = qfq_dump_class,
+ .dump_stats = qfq_dump_class_stats,
+ .walk = qfq_walk,
+};
+
+static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
+ .cl_ops = &qfq_class_ops,
+ .id = "qfq",
+ .priv_size = sizeof(struct qfq_sched),
+ .enqueue = qfq_enqueue,
+ .dequeue = qfq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = qfq_init_qdisc,
+ .reset = qfq_reset_qdisc,
+ .destroy = qfq_destroy_qdisc,
+ .owner = THIS_MODULE,
+};
+
+static int __init qfq_init(void)
+{
+ return register_qdisc(&qfq_qdisc_ops);
+}
+
+static void __exit qfq_exit(void)
+{
+ unregister_qdisc(&qfq_qdisc_ops);
+}
+
+module_init(qfq_init);
+module_exit(qfq_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 000000000..0424aa747
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,455 @@
+/*
+ * net/sched/sch_red.c Random Early Detection queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * J Hadi Salim 980914: computation fixes
+ * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
+ * J Hadi Salim 980816: ECN support
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+
+/* Parameters, settable by user:
+ -----------------------------
+
+ limit - bytes (must be > qth_max + burst)
+
+ Hard limit on queue length, should be chosen >qth_max
+ to allow packet bursts. This parameter does not
+ affect the algorithms behaviour and can be chosen
+ arbitrarily high (well, less than ram size)
+ Really, this limit will never be reached
+ if RED works correctly.
+ */
+
+struct red_sched_data {
+ u32 limit; /* HARD maximal queue length */
+ unsigned char flags;
+ struct timer_list adapt_timer;
+ struct Qdisc *sch;
+ struct red_parms parms;
+ struct red_vars vars;
+ struct red_stats stats;
+ struct Qdisc *qdisc;
+};
+
+static inline int red_use_ecn(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+static inline int red_use_harddrop(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ int ret;
+
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ child->qstats.backlog);
+
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (red_use_harddrop(q) || !red_use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ break;
+ }
+
+ ret = qdisc_enqueue(skb, child, to_free);
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ } else if (net_xmit_drop_count(ret)) {
+ q->stats.pdrop++;
+ qdisc_qstats_drop(sch);
+ }
+ return ret;
+
+congestion_drop:
+ qdisc_drop(skb, sch, to_free);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *red_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ skb = child->dequeue(child);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ } else {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ }
+ return skb;
+}
+
+static struct sk_buff *red_peek(struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ return child->ops->peek(child);
+}
+
+static void red_reset(struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+ red_restart(&q->vars);
+}
+
+static int red_offload(struct Qdisc *sch, bool enable)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_red_qopt_offload opt = {
+ .handle = sch->handle,
+ .parent = sch->parent,
+ };
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ if (enable) {
+ opt.command = TC_RED_REPLACE;
+ opt.set.min = q->parms.qth_min >> q->parms.Wlog;
+ opt.set.max = q->parms.qth_max >> q->parms.Wlog;
+ opt.set.probability = q->parms.max_P;
+ opt.set.is_ecn = red_use_ecn(q);
+ opt.set.qstats = &sch->qstats;
+ } else {
+ opt.command = TC_RED_DESTROY;
+ }
+
+ return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
+}
+
+static void red_destroy(struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ del_timer_sync(&q->adapt_timer);
+ red_offload(sch, false);
+ qdisc_put(q->qdisc);
+}
+
+static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
+ [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_RED_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_RED_MAX_P] = { .type = NLA_U32 },
+};
+
+static int red_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_RED_MAX + 1];
+ struct tc_red_qopt *ctl;
+ struct Qdisc *child = NULL;
+ int err;
+ u32 max_P;
+ u8 *stab;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_RED_PARMS] == NULL ||
+ tb[TCA_RED_STAB] == NULL)
+ return -EINVAL;
+
+ max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
+
+ ctl = nla_data(tb[TCA_RED_PARMS]);
+ stab = nla_data(tb[TCA_RED_STAB]);
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Scell_log, stab))
+ return -EINVAL;
+
+ if (ctl->limit > 0) {
+ child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit,
+ extack);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ /* child is fifo, no need to check for noop_qdisc */
+ qdisc_hash_add(child, true);
+ }
+
+ sch_tree_lock(sch);
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+ if (child) {
+ qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+ q->qdisc->qstats.backlog);
+ qdisc_put(q->qdisc);
+ q->qdisc = child;
+ }
+
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ stab,
+ max_P);
+ red_set_vars(&q->vars);
+
+ del_timer(&q->adapt_timer);
+ if (ctl->flags & TC_RED_ADAPTATIVE)
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
+
+ if (!q->qdisc->q.qlen)
+ red_start_of_idle_period(&q->vars);
+
+ sch_tree_unlock(sch);
+ red_offload(sch, true);
+ return 0;
+}
+
+static inline void red_adaptative_timer(struct timer_list *t)
+{
+ struct red_sched_data *q = from_timer(q, t, adapt_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ red_adaptative_algo(&q->parms, &q->vars);
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
+ spin_unlock(root_lock);
+}
+
+static int red_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ q->qdisc = &noop_qdisc;
+ q->sch = sch;
+ timer_setup(&q->adapt_timer, red_adaptative_timer, 0);
+ return red_change(sch, opt, extack);
+}
+
+static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_red_qopt_offload hw_stats = {
+ .command = TC_RED_STATS,
+ .handle = sch->handle,
+ .parent = sch->parent,
+ {
+ .stats.bstats = &sch->bstats,
+ .stats.qstats = &sch->qstats,
+ },
+ };
+ int err;
+
+ sch->flags &= ~TCQ_F_OFFLOADED;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return 0;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+ &hw_stats);
+ if (err == -EOPNOTSUPP)
+ return 0;
+
+ if (!err)
+ sch->flags |= TCQ_F_OFFLOADED;
+
+ return err;
+}
+
+static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+ int err;
+
+ err = red_dump_offload_stats(sch, &opt);
+ if (err)
+ goto nla_put_failure;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
+ nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_red_xstats st = {0};
+
+ if (sch->flags & TCQ_F_OFFLOADED) {
+ struct tc_red_qopt_offload hw_stats_request = {
+ .command = TC_RED_XSTATS,
+ .handle = sch->handle,
+ .parent = sch->parent,
+ {
+ .xstats = &q->stats,
+ },
+ };
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+ &hw_stats_request);
+ }
+ st.early = q->stats.prob_drop + q->stats.forced_drop;
+ st.pdrop = q->stats.pdrop;
+ st.other = q->stats.other;
+ st.marked = q->stats.prob_mark + q->stats.forced_mark;
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static int red_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+ return 0;
+}
+
+static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ *old = qdisc_replace(sch, new, &q->qdisc);
+ return 0;
+}
+
+static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long red_find(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops red_class_ops = {
+ .graft = red_graft,
+ .leaf = red_leaf,
+ .find = red_find,
+ .walk = red_walk,
+ .dump = red_dump_class,
+};
+
+static struct Qdisc_ops red_qdisc_ops __read_mostly = {
+ .id = "red",
+ .priv_size = sizeof(struct red_sched_data),
+ .cl_ops = &red_class_ops,
+ .enqueue = red_enqueue,
+ .dequeue = red_dequeue,
+ .peek = red_peek,
+ .init = red_init,
+ .reset = red_reset,
+ .destroy = red_destroy,
+ .change = red_change,
+ .dump = red_dump,
+ .dump_stats = red_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init red_module_init(void)
+{
+ return register_qdisc(&red_qdisc_ops);
+}
+
+static void __exit red_module_exit(void)
+{
+ unregister_qdisc(&red_qdisc_ops);
+}
+
+module_init(red_module_init)
+module_exit(red_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
new file mode 100644
index 000000000..81d205acb
--- /dev/null
+++ b/net/sched/sch_sfb.c
@@ -0,0 +1,734 @@
+/*
+ * net/sched/sch_sfb.c Stochastic Fair Blue
+ *
+ * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
+ * A New Class of Active Queue Management Algorithms.
+ * U. Michigan CSE-TR-387-99, April 1999.
+ *
+ * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/siphash.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/inet_ecn.h>
+
+/*
+ * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
+ * This implementation uses L = 8 and N = 16
+ * This permits us to split one 32bit hash (provided per packet by rxhash or
+ * external classifier) into 8 subhashes of 4 bits.
+ */
+#define SFB_BUCKET_SHIFT 4
+#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
+#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
+#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
+
+/* SFB algo uses a virtual queue, named "bin" */
+struct sfb_bucket {
+ u16 qlen; /* length of virtual queue */
+ u16 p_mark; /* marking probability */
+};
+
+/* We use a double buffering right before hash change
+ * (Section 4.4 of SFB reference : moving hash functions)
+ */
+struct sfb_bins {
+ siphash_key_t perturbation; /* siphash key */
+ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
+};
+
+struct sfb_sched_data {
+ struct Qdisc *qdisc;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ unsigned long rehash_interval;
+ unsigned long warmup_time; /* double buffering warmup time in jiffies */
+ u32 max;
+ u32 bin_size; /* maximum queue length per bin */
+ u32 increment; /* d1 */
+ u32 decrement; /* d2 */
+ u32 limit; /* HARD maximal queue length */
+ u32 penalty_rate;
+ u32 penalty_burst;
+ u32 tokens_avail;
+ unsigned long rehash_time;
+ unsigned long token_time;
+
+ u8 slot; /* current active bins (0 or 1) */
+ bool double_buffering;
+ struct sfb_bins bins[2];
+
+ struct {
+ u32 earlydrop;
+ u32 penaltydrop;
+ u32 bucketdrop;
+ u32 queuedrop;
+ u32 childdrop; /* drops in child qdisc */
+ u32 marked; /* ECN mark */
+ } stats;
+};
+
+/*
+ * Each queued skb might be hashed on one or two bins
+ * We store in skb_cb the two hash values.
+ * (A zero value means double buffering was not used)
+ */
+struct sfb_skb_cb {
+ u32 hashes[2];
+};
+
+static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb));
+ return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/*
+ * If using 'internal' SFB flow classifier, hash comes from skb rxhash
+ * If using external classifier, hash comes from the classid.
+ */
+static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
+{
+ return sfb_skb_cb(skb)->hashes[slot];
+}
+
+/* Probabilities are coded as Q0.16 fixed-point values,
+ * with 0xFFFF representing 65535/65536 (almost 1.0)
+ * Addition and subtraction are saturating in [0, 65535]
+ */
+static u32 prob_plus(u32 p1, u32 p2)
+{
+ u32 res = p1 + p2;
+
+ return min_t(u32, res, SFB_MAX_PROB);
+}
+
+static u32 prob_minus(u32 p1, u32 p2)
+{
+ return p1 > p2 ? p1 - p2 : 0;
+}
+
+static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen < 0xFFFF)
+ b[hash].qlen++;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_one_qlen(u32 sfbhash, u32 slot,
+ struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen > 0)
+ b[hash].qlen--;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_minus(b->p_mark, q->decrement);
+}
+
+static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_plus(b->p_mark, q->increment);
+}
+
+static void sfb_zero_all_buckets(struct sfb_sched_data *q)
+{
+ memset(&q->bins, 0, sizeof(q->bins));
+}
+
+/*
+ * compute max qlen, max p_mark, and avg p_mark
+ */
+static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
+{
+ int i;
+ u32 qlen = 0, prob = 0, totalpm = 0;
+ const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
+ if (qlen < b->qlen)
+ qlen = b->qlen;
+ totalpm += b->p_mark;
+ if (prob < b->p_mark)
+ prob = b->p_mark;
+ b++;
+ }
+ *prob_r = prob;
+ *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
+ return qlen;
+}
+
+
+static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
+{
+ get_random_bytes(&q->bins[slot].perturbation,
+ sizeof(q->bins[slot].perturbation));
+}
+
+static void sfb_swap_slot(struct sfb_sched_data *q)
+{
+ sfb_init_perturbation(q->slot, q);
+ q->slot ^= 1;
+ q->double_buffering = false;
+}
+
+/* Non elastic flows are allowed to use part of the bandwidth, expressed
+ * in "penalty_rate" packets per second, with "penalty_burst" burst
+ */
+static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ if (q->penalty_rate == 0 || q->penalty_burst == 0)
+ return true;
+
+ if (q->tokens_avail < 1) {
+ unsigned long age = min(10UL * HZ, jiffies - q->token_time);
+
+ q->tokens_avail = (age * q->penalty_rate) / HZ;
+ if (q->tokens_avail > q->penalty_burst)
+ q->tokens_avail = q->penalty_burst;
+ q->token_time = jiffies;
+ if (q->tokens_avail < 1)
+ return true;
+ }
+
+ q->tokens_avail--;
+ return false;
+}
+
+static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
+ int *qerr, u32 *salt)
+{
+ struct tcf_result res;
+ int result;
+
+ result = tcf_classify(skb, fl, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ *salt = TC_H_MIN(res.classid);
+ return true;
+ }
+ return false;
+}
+
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ struct tcf_proto *fl;
+ int i;
+ u32 p_min = ~0;
+ u32 minqlen = ~0;
+ u32 r, sfbhash;
+ u32 slot = q->slot;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ if (unlikely(sch->q.qlen >= q->limit)) {
+ qdisc_qstats_overlimit(sch);
+ q->stats.queuedrop++;
+ goto drop;
+ }
+
+ if (q->rehash_interval > 0) {
+ unsigned long limit = q->rehash_time + q->rehash_interval;
+
+ if (unlikely(time_after(jiffies, limit))) {
+ sfb_swap_slot(q);
+ q->rehash_time = jiffies;
+ } else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
+ time_after(jiffies, limit - q->warmup_time))) {
+ q->double_buffering = true;
+ }
+ }
+
+ fl = rcu_dereference_bh(q->filter_list);
+ if (fl) {
+ u32 salt;
+
+ /* If using external classifiers, get result and record it. */
+ if (!sfb_classify(skb, fl, &ret, &salt))
+ goto other_drop;
+ sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation);
+ } else {
+ sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation);
+ }
+
+
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ if (minqlen > b->qlen)
+ minqlen = b->qlen;
+ if (p_min > b->p_mark)
+ p_min = b->p_mark;
+ }
+
+ slot ^= 1;
+ sfb_skb_cb(skb)->hashes[slot] = 0;
+
+ if (unlikely(minqlen >= q->max)) {
+ qdisc_qstats_overlimit(sch);
+ q->stats.bucketdrop++;
+ goto drop;
+ }
+
+ if (unlikely(p_min >= SFB_MAX_PROB)) {
+ /* Inelastic flow */
+ if (q->double_buffering) {
+ sfbhash = skb_get_hash_perturb(skb,
+ &q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ }
+ }
+ if (sfb_rate_limit(skb, q)) {
+ qdisc_qstats_overlimit(sch);
+ q->stats.penaltydrop++;
+ goto drop;
+ }
+ goto enqueue;
+ }
+
+ r = prandom_u32() & SFB_MAX_PROB;
+
+ if (unlikely(r < p_min)) {
+ if (unlikely(p_min > SFB_MAX_PROB / 2)) {
+ /* If we're marking that many packets, then either
+ * this flow is unresponsive, or we're badly congested.
+ * In either case, we want to start dropping packets.
+ */
+ if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.marked++;
+ } else {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+
+enqueue:
+ ret = qdisc_enqueue(skb, child, to_free);
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ increment_qlen(skb, q);
+ } else if (net_xmit_drop_count(ret)) {
+ q->stats.childdrop++;
+ qdisc_qstats_drop(sch);
+ }
+ return ret;
+
+drop:
+ qdisc_drop(skb, sch, to_free);
+ return NET_XMIT_CN;
+other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ struct sk_buff *skb;
+
+ skb = child->dequeue(q->qdisc);
+
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ decrement_qlen(skb, q);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *sfb_peek(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ return child->ops->peek(child);
+}
+
+/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
+
+static void sfb_reset(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+}
+
+static void sfb_destroy(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ qdisc_put(q->qdisc);
+}
+
+static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
+ [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
+};
+
+static const struct tc_sfb_qopt sfb_default_ops = {
+ .rehash_interval = 600 * MSEC_PER_SEC,
+ .warmup_time = 60 * MSEC_PER_SEC,
+ .limit = 0,
+ .max = 25,
+ .bin_size = 20,
+ .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
+ .decrement = (SFB_MAX_PROB + 3000) / 6000,
+ .penalty_rate = 10,
+ .penalty_burst = 20,
+};
+
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ struct nlattr *tb[TCA_SFB_MAX + 1];
+ const struct tc_sfb_qopt *ctl = &sfb_default_ops;
+ u32 limit;
+ int err;
+
+ if (opt) {
+ err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy, NULL);
+ if (err < 0)
+ return -EINVAL;
+
+ if (tb[TCA_SFB_PARMS] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_SFB_PARMS]);
+ }
+
+ limit = ctl->limit;
+ if (limit == 0)
+ limit = qdisc_dev(sch)->tx_queue_len;
+
+ child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit, extack);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ if (child != &noop_qdisc)
+ qdisc_hash_add(child, true);
+ sch_tree_lock(sch);
+
+ qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+ q->qdisc->qstats.backlog);
+ qdisc_put(q->qdisc);
+ q->qdisc = child;
+
+ q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
+ q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
+ q->rehash_time = jiffies;
+ q->limit = limit;
+ q->increment = ctl->increment;
+ q->decrement = ctl->decrement;
+ q->max = ctl->max;
+ q->bin_size = ctl->bin_size;
+ q->penalty_rate = ctl->penalty_rate;
+ q->penalty_burst = ctl->penalty_burst;
+ q->tokens_avail = ctl->penalty_burst;
+ q->token_time = jiffies;
+
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+ sfb_init_perturbation(1, q);
+
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ q->qdisc = &noop_qdisc;
+ return sfb_change(sch, opt, extack);
+}
+
+static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct tc_sfb_qopt opt = {
+ .rehash_interval = jiffies_to_msecs(q->rehash_interval),
+ .warmup_time = jiffies_to_msecs(q->warmup_time),
+ .limit = q->limit,
+ .max = q->max,
+ .bin_size = q->bin_size,
+ .increment = q->increment,
+ .decrement = q->decrement,
+ .penalty_rate = q->penalty_rate,
+ .penalty_burst = q->penalty_burst,
+ };
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct tc_sfb_xstats st = {
+ .earlydrop = q->stats.earlydrop,
+ .penaltydrop = q->stats.penaltydrop,
+ .bucketdrop = q->stats.bucketdrop,
+ .queuedrop = q->stats.queuedrop,
+ .childdrop = q->stats.childdrop,
+ .marked = q->stats.marked,
+ };
+
+ st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ return -ENOSYS;
+}
+
+static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ *old = qdisc_replace(sch, new, &q->qdisc);
+ return 0;
+}
+
+static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+
+static unsigned long sfb_find(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void sfb_unbind(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ return -ENOSYS;
+}
+
+static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+{
+ return -ENOSYS;
+}
+
+static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->block;
+}
+
+static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+
+static const struct Qdisc_class_ops sfb_class_ops = {
+ .graft = sfb_graft,
+ .leaf = sfb_leaf,
+ .find = sfb_find,
+ .change = sfb_change_class,
+ .delete = sfb_delete,
+ .walk = sfb_walk,
+ .tcf_block = sfb_tcf_block,
+ .bind_tcf = sfb_bind,
+ .unbind_tcf = sfb_unbind,
+ .dump = sfb_dump_class,
+};
+
+static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
+ .id = "sfb",
+ .priv_size = sizeof(struct sfb_sched_data),
+ .cl_ops = &sfb_class_ops,
+ .enqueue = sfb_enqueue,
+ .dequeue = sfb_dequeue,
+ .peek = sfb_peek,
+ .init = sfb_init,
+ .reset = sfb_reset,
+ .destroy = sfb_destroy,
+ .change = sfb_change,
+ .dump = sfb_dump,
+ .dump_stats = sfb_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init sfb_module_init(void)
+{
+ return register_qdisc(&sfb_qdisc_ops);
+}
+
+static void __exit sfb_module_exit(void)
+{
+ unregister_qdisc(&sfb_qdisc_ops);
+}
+
+module_init(sfb_module_init)
+module_exit(sfb_module_exit)
+
+MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
+MODULE_AUTHOR("Juliusz Chroboczek");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
new file mode 100644
index 000000000..1bfdf90fa
--- /dev/null
+++ b/net/sched/sch_sfq.c
@@ -0,0 +1,944 @@
+/*
+ * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/siphash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/red.h>
+
+
+/* Stochastic Fairness Queuing algorithm.
+ =======================================
+
+ Source:
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
+
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ "Interworking: Research and Experience", v.2, 1991, p.113-131.
+
+
+ See also:
+ M. Shreedhar and George Varghese "Efficient Fair
+ Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
+
+
+ This is not the thing that is usually called (W)FQ nowadays.
+ It does not use any timestamp mechanism, but instead
+ processes queues in round-robin order.
+
+ ADVANTAGE:
+
+ - It is very cheap. Both CPU and memory requirements are minimal.
+
+ DRAWBACKS:
+
+ - "Stochastic" -> It is not 100% fair.
+ When hash collisions occur, several flows are considered as one.
+
+ - "Round-robin" -> It introduces larger delays than virtual clock
+ based schemes, and should not be used for isolating interactive
+ traffic from non-interactive. It means, that this scheduler
+ should be used as leaf of CBQ or P3, which put interactive traffic
+ to higher priority band.
+
+ We still need true WFQ for top level CSZ, but using WFQ
+ for the best effort traffic is absolutely pointless:
+ SFQ is superior for this purpose.
+
+ IMPLEMENTATION:
+ This implementation limits :
+ - maximal queue length per flow to 127 packets.
+ - max mtu to 2^18-1;
+ - max 65408 flows,
+ - number of hash buckets to 65536.
+
+ It is easy to increase these values, but not in flight. */
+
+#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
+#define SFQ_DEFAULT_FLOWS 128
+#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
+#define SFQ_EMPTY_SLOT 0xffff
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
+
+/* We use 16 bits to store allot, and want to handle packets up to 64K
+ * Scale allot by 8 (1<<3) so that no overflow occurs.
+ */
+#define SFQ_ALLOT_SHIFT 3
+#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
+
+/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
+typedef u16 sfq_index;
+
+/*
+ * We dont use pointers to save space.
+ * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
+ * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
+ * are 'pointers' to dep[] array
+ */
+struct sfq_head {
+ sfq_index next;
+ sfq_index prev;
+};
+
+struct sfq_slot {
+ struct sk_buff *skblist_next;
+ struct sk_buff *skblist_prev;
+ sfq_index qlen; /* number of skbs in skblist */
+ sfq_index next; /* next slot in sfq RR chain */
+ struct sfq_head dep; /* anchor in dep[] chains */
+ unsigned short hash; /* hash value (index in ht[]) */
+ short allot; /* credit for this slot */
+
+ unsigned int backlog;
+ struct red_vars vars;
+};
+
+struct sfq_sched_data {
+/* frequently used fields */
+ int limit; /* limit of total number of packets in this qdisc */
+ unsigned int divisor; /* number of slots in hash table */
+ u8 headdrop;
+ u8 maxdepth; /* limit of packets per flow */
+
+ siphash_key_t perturbation;
+ u8 cur_depth; /* depth of longest slot */
+ u8 flags;
+ unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ sfq_index *ht; /* Hash table ('divisor' slots) */
+ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
+
+ struct red_parms *red_parms;
+ struct tc_sfqred_stats stats;
+ struct sfq_slot *tail; /* current slot in round */
+
+ struct sfq_head dep[SFQ_MAX_DEPTH + 1];
+ /* Linked lists of slots, indexed by depth
+ * dep[0] : list of unused flows
+ * dep[1] : list of flows with 1 packet
+ * dep[X] : list of flows with X packets
+ */
+
+ unsigned int maxflows; /* number of flows in flows array */
+ int perturb_period;
+ unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
+ struct timer_list perturb_timer;
+ struct Qdisc *sch;
+};
+
+/*
+ * sfq_head are either in a sfq_slot or in dep[] array
+ */
+static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
+{
+ if (val < SFQ_MAX_FLOWS)
+ return &q->slots[val].dep;
+ return &q->dep[val - SFQ_MAX_FLOWS];
+}
+
+static unsigned int sfq_hash(const struct sfq_sched_data *q,
+ const struct sk_buff *skb)
+{
+ return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
+}
+
+static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->divisor)
+ return TC_H_MIN(skb->priority);
+
+ fl = rcu_dereference_bh(q->filter_list);
+ if (!fl)
+ return sfq_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, fl, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->divisor)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/*
+ * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
+ */
+static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ struct sfq_slot *slot = &q->slots[x];
+ int qlen = slot->qlen;
+
+ p = qlen + SFQ_MAX_FLOWS;
+ n = q->dep[qlen].next;
+
+ slot->dep.next = n;
+ slot->dep.prev = p;
+
+ q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
+ sfq_dep_head(q, n)->prev = x;
+}
+
+#define sfq_unlink(q, x, n, p) \
+ do { \
+ n = q->slots[x].dep.next; \
+ p = q->slots[x].dep.prev; \
+ sfq_dep_head(q, p)->next = n; \
+ sfq_dep_head(q, n)->prev = p; \
+ } while (0)
+
+
+static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ int d;
+
+ sfq_unlink(q, x, n, p);
+
+ d = q->slots[x].qlen--;
+ if (n == p && q->cur_depth == d)
+ q->cur_depth--;
+ sfq_link(q, x);
+}
+
+static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ int d;
+
+ sfq_unlink(q, x, n, p);
+
+ d = ++q->slots[x].qlen;
+ if (q->cur_depth < d)
+ q->cur_depth = d;
+ sfq_link(q, x);
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from tail of slot queue */
+static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
+{
+ struct sk_buff *skb = slot->skblist_prev;
+
+ slot->skblist_prev = skb->prev;
+ skb->prev->next = (struct sk_buff *)slot;
+ skb->next = skb->prev = NULL;
+ return skb;
+}
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
+{
+ struct sk_buff *skb = slot->skblist_next;
+
+ slot->skblist_next = skb->next;
+ skb->next->prev = (struct sk_buff *)slot;
+ skb->next = skb->prev = NULL;
+ return skb;
+}
+
+static inline void slot_queue_init(struct sfq_slot *slot)
+{
+ memset(slot, 0, sizeof(*slot));
+ slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
+}
+
+/* add skb to slot queue (tail add) */
+static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
+{
+ skb->prev = slot->skblist_prev;
+ skb->next = (struct sk_buff *)slot;
+ slot->skblist_prev->next = skb;
+ slot->skblist_prev = skb;
+}
+
+static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ sfq_index x, d = q->cur_depth;
+ struct sk_buff *skb;
+ unsigned int len;
+ struct sfq_slot *slot;
+
+ /* Queue is full! Find the longest slot and drop tail packet from it */
+ if (d > 1) {
+ x = q->dep[d].next;
+ slot = &q->slots[x];
+drop:
+ skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
+ len = qdisc_pkt_len(skb);
+ slot->backlog -= len;
+ sfq_dec(q, x);
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_drop(skb, sch, to_free);
+ return len;
+ }
+
+ if (d == 1) {
+ /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
+ x = q->tail->next;
+ slot = &q->slots[x];
+ q->tail->next = slot->next;
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ goto drop;
+ }
+
+ return 0;
+}
+
+/* Is ECN parameter configured */
+static int sfq_prob_mark(const struct sfq_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max threshold just be marked */
+static int sfq_hard_mark(const struct sfq_sched_data *q)
+{
+ return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
+}
+
+static int sfq_headdrop(const struct sfq_sched_data *q)
+{
+ return q->headdrop;
+}
+
+static int
+sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ unsigned int hash, dropped;
+ sfq_index x, qlen;
+ struct sfq_slot *slot;
+ int uninitialized_var(ret);
+ struct sk_buff *head;
+ int delta;
+
+ hash = sfq_classify(skb, sch, &ret);
+ if (hash == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+ hash--;
+
+ x = q->ht[hash];
+ slot = &q->slots[x];
+ if (x == SFQ_EMPTY_SLOT) {
+ x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS)
+ return qdisc_drop(skb, sch, to_free);
+ q->ht[hash] = x;
+ slot = &q->slots[x];
+ slot->hash = hash;
+ slot->backlog = 0; /* should already be 0 anyway... */
+ red_set_vars(&slot->vars);
+ goto enqueue;
+ }
+ if (q->red_parms) {
+ slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ switch (red_action(q->red_parms,
+ &slot->vars,
+ slot->vars.qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (sfq_prob_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.prob_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.prob_mark++;
+ break;
+ }
+ }
+ q->stats.prob_drop++;
+ goto congestion_drop;
+
+ case RED_HARD_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (sfq_hard_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.forced_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.forced_mark++;
+ break;
+ }
+ }
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ }
+
+ if (slot->qlen >= q->maxdepth) {
+congestion_drop:
+ if (!sfq_headdrop(q))
+ return qdisc_drop(skb, sch, to_free);
+
+ /* We know we have at least one packet in queue */
+ head = slot_dequeue_head(slot);
+ delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
+ sch->qstats.backlog -= delta;
+ slot->backlog -= delta;
+ qdisc_drop(head, sch, to_free);
+
+ slot_queue_add(slot, skb);
+ qdisc_tree_reduce_backlog(sch, 0, delta);
+ return NET_XMIT_CN;
+ }
+
+enqueue:
+ qdisc_qstats_backlog_inc(sch, skb);
+ slot->backlog += qdisc_pkt_len(skb);
+ slot_queue_add(slot, skb);
+ sfq_inc(q, x);
+ if (slot->qlen == 1) { /* The flow is new */
+ if (q->tail == NULL) { /* It is the first flow */
+ slot->next = x;
+ } else {
+ slot->next = q->tail->next;
+ q->tail->next = x;
+ }
+ /* We put this flow at the end of our flow list.
+ * This might sound unfair for a new flow to wait after old ones,
+ * but we could endup servicing new flows only, and freeze old ones.
+ */
+ q->tail = slot;
+ /* We could use a bigger initial quantum for new flows */
+ slot->allot = q->scaled_quantum;
+ }
+ if (++sch->q.qlen <= q->limit)
+ return NET_XMIT_SUCCESS;
+
+ qlen = slot->qlen;
+ dropped = sfq_drop(sch, to_free);
+ /* Return Congestion Notification only if we dropped a packet
+ * from this flow.
+ */
+ if (qlen != slot->qlen) {
+ qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
+ return NET_XMIT_CN;
+ }
+
+ /* As we dropped a packet, better let upper stack know this */
+ qdisc_tree_reduce_backlog(sch, 1, dropped);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+sfq_dequeue(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ sfq_index a, next_a;
+ struct sfq_slot *slot;
+
+ /* No active slots */
+ if (q->tail == NULL)
+ return NULL;
+
+next_slot:
+ a = q->tail->next;
+ slot = &q->slots[a];
+ if (slot->allot <= 0) {
+ q->tail = slot;
+ slot->allot += q->scaled_quantum;
+ goto next_slot;
+ }
+ skb = slot_dequeue_head(slot);
+ sfq_dec(q, a);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ slot->backlog -= qdisc_pkt_len(skb);
+ /* Is the slot empty? */
+ if (slot->qlen == 0) {
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ next_a = slot->next;
+ if (a == next_a) {
+ q->tail = NULL; /* no more active slots */
+ return skb;
+ }
+ q->tail->next = next_a;
+ } else {
+ slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
+ }
+ return skb;
+}
+
+static void
+sfq_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = sfq_dequeue(sch)) != NULL)
+ rtnl_kfree_skbs(skb, skb);
+}
+
+/*
+ * When q->perturbation is changed, we rehash all queued skbs
+ * to avoid OOO (Out Of Order) effects.
+ * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
+ * counters.
+ */
+static void sfq_rehash(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ int i;
+ struct sfq_slot *slot;
+ struct sk_buff_head list;
+ int dropped = 0;
+ unsigned int drop_len = 0;
+
+ __skb_queue_head_init(&list);
+
+ for (i = 0; i < q->maxflows; i++) {
+ slot = &q->slots[i];
+ if (!slot->qlen)
+ continue;
+ while (slot->qlen) {
+ skb = slot_dequeue_head(slot);
+ sfq_dec(q, i);
+ __skb_queue_tail(&list, skb);
+ }
+ slot->backlog = 0;
+ red_set_vars(&slot->vars);
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ }
+ q->tail = NULL;
+
+ while ((skb = __skb_dequeue(&list)) != NULL) {
+ unsigned int hash = sfq_hash(q, skb);
+ sfq_index x = q->ht[hash];
+
+ slot = &q->slots[x];
+ if (x == SFQ_EMPTY_SLOT) {
+ x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS) {
+drop:
+ qdisc_qstats_backlog_dec(sch, skb);
+ drop_len += qdisc_pkt_len(skb);
+ kfree_skb(skb);
+ dropped++;
+ continue;
+ }
+ q->ht[hash] = x;
+ slot = &q->slots[x];
+ slot->hash = hash;
+ }
+ if (slot->qlen >= q->maxdepth)
+ goto drop;
+ slot_queue_add(slot, skb);
+ if (q->red_parms)
+ slot->vars.qavg = red_calc_qavg(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ slot->backlog += qdisc_pkt_len(skb);
+ sfq_inc(q, x);
+ if (slot->qlen == 1) { /* The flow is new */
+ if (q->tail == NULL) { /* It is the first flow */
+ slot->next = x;
+ } else {
+ slot->next = q->tail->next;
+ q->tail->next = x;
+ }
+ q->tail = slot;
+ slot->allot = q->scaled_quantum;
+ }
+ }
+ sch->q.qlen -= dropped;
+ qdisc_tree_reduce_backlog(sch, dropped, drop_len);
+}
+
+static void sfq_perturbation(struct timer_list *t)
+{
+ struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ siphash_key_t nkey;
+
+ get_random_bytes(&nkey, sizeof(nkey));
+ spin_lock(root_lock);
+ q->perturbation = nkey;
+ if (!q->filter_list && q->tail)
+ sfq_rehash(sch);
+ spin_unlock(root_lock);
+
+ if (q->perturb_period)
+ mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+}
+
+static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct tc_sfq_qopt *ctl = nla_data(opt);
+ struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
+ unsigned int qlen, dropped = 0;
+ struct red_parms *p = NULL;
+ struct sk_buff *to_free = NULL;
+ struct sk_buff *tail = NULL;
+
+ if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+ return -EINVAL;
+ if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
+ ctl_v1 = nla_data(opt);
+ if (ctl->divisor &&
+ (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+ return -EINVAL;
+
+ /* slot->allot is a short, make sure quantum is not too big. */
+ if (ctl->quantum) {
+ unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum);
+
+ if (scaled <= 0 || scaled > SHRT_MAX)
+ return -EINVAL;
+ }
+
+ if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
+ ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
+ return -EINVAL;
+ if (ctl_v1 && ctl_v1->qth_min) {
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
+ sch_tree_lock(sch);
+ if (ctl->quantum) {
+ q->quantum = ctl->quantum;
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ }
+ q->perturb_period = ctl->perturb_period * HZ;
+ if (ctl->flows)
+ q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ if (ctl->divisor) {
+ q->divisor = ctl->divisor;
+ q->maxflows = min_t(u32, q->maxflows, q->divisor);
+ }
+ if (ctl_v1) {
+ if (ctl_v1->depth)
+ q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ if (p) {
+ swap(q->red_parms, p);
+ red_set_parms(q->red_parms,
+ ctl_v1->qth_min, ctl_v1->qth_max,
+ ctl_v1->Wlog,
+ ctl_v1->Plog, ctl_v1->Scell_log,
+ NULL,
+ ctl_v1->max_P);
+ }
+ q->flags = ctl_v1->flags;
+ q->headdrop = ctl_v1->headdrop;
+ }
+ if (ctl->limit) {
+ q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
+ q->maxflows = min_t(u32, q->maxflows, q->limit);
+ }
+
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > q->limit) {
+ dropped += sfq_drop(sch, &to_free);
+ if (!tail)
+ tail = to_free;
+ }
+
+ rtnl_kfree_skbs(to_free, tail);
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
+
+ del_timer(&q->perturb_timer);
+ if (q->perturb_period) {
+ mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
+ }
+ sch_tree_unlock(sch);
+ kfree(p);
+ return 0;
+}
+
+static void *sfq_alloc(size_t sz)
+{
+ return kvmalloc(sz, GFP_KERNEL);
+}
+
+static void sfq_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static void sfq_destroy(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ q->perturb_period = 0;
+ del_timer_sync(&q->perturb_timer);
+ sfq_free(q->ht);
+ sfq_free(q->slots);
+ kfree(q->red_parms);
+}
+
+static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ int i;
+ int err;
+
+ q->sch = sch;
+ timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
+ q->dep[i].next = i + SFQ_MAX_FLOWS;
+ q->dep[i].prev = i + SFQ_MAX_FLOWS;
+ }
+
+ q->limit = SFQ_MAX_DEPTH;
+ q->maxdepth = SFQ_MAX_DEPTH;
+ q->cur_depth = 0;
+ q->tail = NULL;
+ q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
+ q->maxflows = SFQ_DEFAULT_FLOWS;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ q->perturb_period = 0;
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
+
+ if (opt) {
+ int err = sfq_change(sch, opt);
+ if (err)
+ return err;
+ }
+
+ q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
+ q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
+ if (!q->ht || !q->slots) {
+ /* Note: sfq_destroy() will be called by our caller */
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < q->divisor; i++)
+ q->ht[i] = SFQ_EMPTY_SLOT;
+
+ for (i = 0; i < q->maxflows; i++) {
+ slot_queue_init(&q->slots[i]);
+ sfq_link(q, i);
+ }
+ if (q->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_sfq_qopt_v1 opt;
+ struct red_parms *p = q->red_parms;
+
+ memset(&opt, 0, sizeof(opt));
+ opt.v0.quantum = q->quantum;
+ opt.v0.perturb_period = q->perturb_period / HZ;
+ opt.v0.limit = q->limit;
+ opt.v0.divisor = q->divisor;
+ opt.v0.flows = q->maxflows;
+ opt.depth = q->maxdepth;
+ opt.headdrop = q->headdrop;
+
+ if (p) {
+ opt.qth_min = p->qth_min >> p->Wlog;
+ opt.qth_max = p->qth_max >> p->Wlog;
+ opt.Wlog = p->Wlog;
+ opt.Plog = p->Plog;
+ opt.Scell_log = p->Scell_log;
+ opt.max_P = p->max_P;
+ }
+ memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
+ opt.flags = q->flags;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static void sfq_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->block;
+}
+
+static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ sfq_index idx = q->ht[cl - 1];
+ struct gnet_stats_queue qs = { 0 };
+ struct tc_sfq_xstats xstats = { 0 };
+
+ if (idx != SFQ_EMPTY_SLOT) {
+ const struct sfq_slot *slot = &q->slots[idx];
+
+ xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
+ qs.qlen = slot->qlen;
+ qs.backlog = slot->backlog;
+ }
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+ return -1;
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->divisor; i++) {
+ if (q->ht[i] == SFQ_EMPTY_SLOT ||
+ arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops sfq_class_ops = {
+ .leaf = sfq_leaf,
+ .find = sfq_find,
+ .tcf_block = sfq_tcf_block,
+ .bind_tcf = sfq_bind,
+ .unbind_tcf = sfq_unbind,
+ .dump = sfq_dump_class,
+ .dump_stats = sfq_dump_class_stats,
+ .walk = sfq_walk,
+};
+
+static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
+ .cl_ops = &sfq_class_ops,
+ .id = "sfq",
+ .priv_size = sizeof(struct sfq_sched_data),
+ .enqueue = sfq_enqueue,
+ .dequeue = sfq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = sfq_init,
+ .reset = sfq_reset,
+ .destroy = sfq_destroy,
+ .change = NULL,
+ .dump = sfq_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init sfq_module_init(void)
+{
+ return register_qdisc(&sfq_qdisc_ops);
+}
+static void __exit sfq_module_exit(void)
+{
+ unregister_qdisc(&sfq_qdisc_ops);
+}
+module_init(sfq_module_init)
+module_exit(sfq_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
new file mode 100644
index 000000000..3d9de5284
--- /dev/null
+++ b/net/sched/sch_skbprio.c
@@ -0,0 +1,323 @@
+/*
+ * net/sched/sch_skbprio.c SKB Priority Queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Nishanth Devarajan, <ndev2021@gmail.com>
+ * Cody Doucette, <doucette@bu.edu>
+ * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/inet_ecn.h>
+
+/* SKB Priority Queue
+ * =================================
+ *
+ * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes
+ * packets according to their skb->priority field. Under congestion,
+ * Skbprio drops already-enqueued lower priority packets to make space
+ * available for higher priority packets; it was conceived as a solution
+ * for denial-of-service defenses that need to route packets with different
+ * priorities as a mean to overcome DoS attacks.
+ */
+
+struct skbprio_sched_data {
+ /* Queue state. */
+ struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY];
+ struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY];
+ u16 highest_prio;
+ u16 lowest_prio;
+};
+
+static u16 calc_new_high_prio(const struct skbprio_sched_data *q)
+{
+ int prio;
+
+ for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
+ if (!skb_queue_empty(&q->qdiscs[prio]))
+ return prio;
+ }
+
+ /* SKB queue is empty, return 0 (default highest priority setting). */
+ return 0;
+}
+
+static u16 calc_new_low_prio(const struct skbprio_sched_data *q)
+{
+ int prio;
+
+ for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
+ if (!skb_queue_empty(&q->qdiscs[prio]))
+ return prio;
+ }
+
+ /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1
+ * (default lowest priority setting).
+ */
+ return SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1;
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *qdisc;
+ struct sk_buff_head *lp_qdisc;
+ struct sk_buff *to_drop;
+ u16 prio, lp;
+
+ /* Obtain the priority of @skb. */
+ prio = min(skb->priority, max_priority);
+
+ qdisc = &q->qdiscs[prio];
+ if (sch->q.qlen < sch->limit) {
+ __skb_queue_tail(qdisc, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+ q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+ /* Check to update highest and lowest priorities. */
+ if (prio > q->highest_prio)
+ q->highest_prio = prio;
+
+ if (prio < q->lowest_prio)
+ q->lowest_prio = prio;
+
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+
+ /* If this packet has the lowest priority, drop it. */
+ lp = q->lowest_prio;
+ if (prio <= lp) {
+ q->qstats[prio].drops++;
+ q->qstats[prio].overlimits++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ __skb_queue_tail(qdisc, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+ q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+ /* Drop the packet at the tail of the lowest priority qdisc. */
+ lp_qdisc = &q->qdiscs[lp];
+ to_drop = __skb_dequeue_tail(lp_qdisc);
+ BUG_ON(!to_drop);
+ qdisc_qstats_backlog_dec(sch, to_drop);
+ qdisc_drop(to_drop, sch, to_free);
+
+ q->qstats[lp].backlog -= qdisc_pkt_len(to_drop);
+ q->qstats[lp].drops++;
+ q->qstats[lp].overlimits++;
+
+ /* Check to update highest and lowest priorities. */
+ if (skb_queue_empty(lp_qdisc)) {
+ if (q->lowest_prio == q->highest_prio) {
+ /* The incoming packet is the only packet in queue. */
+ BUG_ON(sch->q.qlen != 1);
+ q->lowest_prio = prio;
+ q->highest_prio = prio;
+ } else {
+ q->lowest_prio = calc_new_low_prio(q);
+ }
+ }
+
+ if (prio > q->highest_prio)
+ q->highest_prio = prio;
+
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
+ struct sk_buff *skb = __skb_dequeue(hpq);
+
+ if (unlikely(!skb))
+ return NULL;
+
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+
+ q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb);
+
+ /* Update highest priority field. */
+ if (skb_queue_empty(hpq)) {
+ if (q->lowest_prio == q->highest_prio) {
+ BUG_ON(sch->q.qlen);
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+ } else {
+ q->highest_prio = calc_new_high_prio(q);
+ }
+ }
+ return skb;
+}
+
+static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_skbprio_qopt *ctl = nla_data(opt);
+
+ if (opt->nla_len != nla_attr_size(sizeof(*ctl)))
+ return -EINVAL;
+
+ sch->limit = ctl->limit;
+ return 0;
+}
+
+static int skbprio_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ /* Initialise all queues, one for each possible priority. */
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_head_init(&q->qdiscs[prio]);
+
+ memset(&q->qstats, 0, sizeof(q->qstats));
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+ sch->limit = 64;
+ if (!opt)
+ return 0;
+
+ return skbprio_change(sch, opt, extack);
+}
+
+static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tc_skbprio_qopt opt;
+
+ opt.limit = sch->limit;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ return -1;
+
+ return skb->len;
+}
+
+static void skbprio_reset(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_purge(&q->qdiscs[prio]);
+
+ memset(&q->qstats, 0, sizeof(q->qstats));
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static void skbprio_destroy(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_purge(&q->qdiscs[prio]);
+}
+
+static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long skbprio_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1],
+ q->qstats[cl - 1].qlen) < 0)
+ return -1;
+ return 0;
+}
+
+static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops skbprio_class_ops = {
+ .leaf = skbprio_leaf,
+ .find = skbprio_find,
+ .dump = skbprio_dump_class,
+ .dump_stats = skbprio_dump_class_stats,
+ .walk = skbprio_walk,
+};
+
+static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
+ .cl_ops = &skbprio_class_ops,
+ .id = "skbprio",
+ .priv_size = sizeof(struct skbprio_sched_data),
+ .enqueue = skbprio_enqueue,
+ .dequeue = skbprio_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = skbprio_init,
+ .reset = skbprio_reset,
+ .change = skbprio_change,
+ .dump = skbprio_dump,
+ .destroy = skbprio_destroy,
+ .owner = THIS_MODULE,
+};
+
+static int __init skbprio_module_init(void)
+{
+ return register_qdisc(&skbprio_qdisc_ops);
+}
+
+static void __exit skbprio_module_exit(void)
+{
+ unregister_qdisc(&skbprio_qdisc_ops);
+}
+
+module_init(skbprio_module_init)
+module_exit(skbprio_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
new file mode 100644
index 000000000..dd29de141
--- /dev/null
+++ b/net/sched/sch_tbf.c
@@ -0,0 +1,563 @@
+/*
+ * net/sched/sch_tbf.c Token Bucket Filter queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
+ * original idea by Martin Devera
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+
+/* Simple Token Bucket Filter.
+ =======================================
+
+ SOURCE.
+ -------
+
+ None.
+
+ Description.
+ ------------
+
+ A data flow obeys TBF with rate R and depth B, if for any
+ time interval t_i...t_f the number of transmitted bits
+ does not exceed B + R*(t_f-t_i).
+
+ Packetized version of this definition:
+ The sequence of packets of sizes s_i served at moments t_i
+ obeys TBF, if for any i<=k:
+
+ s_i+....+s_k <= B + R*(t_k - t_i)
+
+ Algorithm.
+ ----------
+
+ Let N(t_i) be B/R initially and N(t) grow continuously with time as:
+
+ N(t+delta) = min{B/R, N(t) + delta}
+
+ If the first packet in queue has length S, it may be
+ transmitted only at the time t_* when S/R <= N(t_*),
+ and in this case N(t) jumps:
+
+ N(t_* + 0) = N(t_* - 0) - S/R.
+
+
+
+ Actually, QoS requires two TBF to be applied to a data stream.
+ One of them controls steady state burst size, another
+ one with rate P (peak rate) and depth M (equal to link MTU)
+ limits bursts at a smaller time scale.
+
+ It is easy to see that P>R, and B>M. If P is infinity, this double
+ TBF is equivalent to a single one.
+
+ When TBF works in reshaping mode, latency is estimated as:
+
+ lat = max ((L-B)/R, (L-M)/P)
+
+
+ NOTES.
+ ------
+
+ If TBF throttles, it starts a watchdog timer, which will wake it up
+ when it is ready to transmit.
+ Note that the minimal timer resolution is 1/HZ.
+ If no new packets arrive during this period,
+ or if the device is not awaken by EOI for some previous packet,
+ TBF can stop its activity for 1/HZ.
+
+
+ This means, that with depth B, the maximal rate is
+
+ R_crit = B*HZ
+
+ F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
+
+ Note that the peak rate TBF is much more tough: with MTU 1500
+ P_crit = 150Kbytes/sec. So, if you need greater peak
+ rates, use alpha with HZ=1000 :-)
+
+ With classful TBF, limit is just kept for backwards compatibility.
+ It is passed to the default bfifo qdisc - if the inner qdisc is
+ changed the limit is not effective anymore.
+*/
+
+struct tbf_sched_data {
+/* Parameters */
+ u32 limit; /* Maximal length of backlog: bytes */
+ u32 max_size;
+ s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
+ s64 mtu;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg peak;
+
+/* Variables */
+ s64 tokens; /* Current number of B tokens */
+ s64 ptokens; /* Current number of P tokens */
+ s64 t_c; /* Time check-point */
+ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
+ struct qdisc_watchdog watchdog; /* Watchdog timer */
+};
+
+
+/* Time to Length, convert time in ns to length in bytes
+ * to determinate how many bytes can be sent in given time.
+ */
+static u64 psched_ns_t2l(const struct psched_ratecfg *r,
+ u64 time_in_ns)
+{
+ /* The formula is :
+ * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
+ */
+ u64 len = time_in_ns * r->rate_bytes_ps;
+
+ do_div(len, NSEC_PER_SEC);
+
+ if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
+ do_div(len, 53);
+ len = len * 48;
+ }
+
+ if (len > r->overhead)
+ len -= r->overhead;
+ else
+ len = 0;
+
+ return len;
+}
+
+/* GSO packet is too big, segment it so that tbf can transmit
+ * each segment in time
+ */
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features = netif_skb_features(skb);
+ unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
+ int ret, nb;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ nb = 0;
+ while (segs) {
+ nskb = segs->next;
+ segs->next = NULL;
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ len += segs->len;
+ ret = qdisc_enqueue(segs, q->qdisc, to_free);
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ } else {
+ nb++;
+ }
+ segs = nskb;
+ }
+ sch->q.qlen += nb;
+ if (nb > 1)
+ qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
+ consume_skb(skb);
+ return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ int ret;
+
+ if (qdisc_pkt_len(skb) > q->max_size) {
+ if (skb_is_gso(skb) &&
+ skb_gso_validate_mac_len(skb, q->max_size))
+ return tbf_segment(skb, sch, to_free);
+ return qdisc_drop(skb, sch, to_free);
+ }
+ ret = qdisc_enqueue(skb, q->qdisc, to_free);
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ return ret;
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+}
+
+static bool tbf_peak_present(const struct tbf_sched_data *q)
+{
+ return q->peak.rate_bytes_ps;
+}
+
+static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ skb = q->qdisc->ops->peek(q->qdisc);
+
+ if (skb) {
+ s64 now;
+ s64 toks;
+ s64 ptoks = 0;
+ unsigned int len = qdisc_pkt_len(skb);
+
+ now = ktime_get_ns();
+ toks = min_t(s64, now - q->t_c, q->buffer);
+
+ if (tbf_peak_present(q)) {
+ ptoks = toks + q->ptokens;
+ if (ptoks > q->mtu)
+ ptoks = q->mtu;
+ ptoks -= (s64) psched_l2t_ns(&q->peak, len);
+ }
+ toks += q->tokens;
+ if (toks > q->buffer)
+ toks = q->buffer;
+ toks -= (s64) psched_l2t_ns(&q->rate, len);
+
+ if ((toks|ptoks) >= 0) {
+ skb = qdisc_dequeue_peeked(q->qdisc);
+ if (unlikely(!skb))
+ return NULL;
+
+ q->t_c = now;
+ q->tokens = toks;
+ q->ptokens = ptoks;
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ now + max_t(long, -toks, -ptoks));
+
+ /* Maybe we have a shorter packet in the queue,
+ which can be sent now. It sounds cool,
+ but, however, this is wrong in principle.
+ We MUST NOT reorder packets under these circumstances.
+
+ Really, if we split the flow into independent
+ subflows, it would be a very good solution.
+ This is the main idea of all FQ algorithms
+ (cf. CSZ, HPFQ, HFSC)
+ */
+
+ qdisc_qstats_overlimit(sch);
+ }
+ return NULL;
+}
+
+static void tbf_reset(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+ q->t_c = ktime_get_ns();
+ q->tokens = q->buffer;
+ q->ptokens = q->mtu;
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
+ [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
+ [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_TBF_RATE64] = { .type = NLA_U64 },
+ [TCA_TBF_PRATE64] = { .type = NLA_U64 },
+ [TCA_TBF_BURST] = { .type = NLA_U32 },
+ [TCA_TBF_PBURST] = { .type = NLA_U32 },
+};
+
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_TBF_MAX + 1];
+ struct tc_tbf_qopt *qopt;
+ struct Qdisc *child = NULL;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg peak;
+ u64 max_size;
+ s64 buffer, mtu;
+ u64 rate64 = 0, prate64 = 0;
+
+ err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ if (tb[TCA_TBF_PARMS] == NULL)
+ goto done;
+
+ qopt = nla_data(tb[TCA_TBF_PARMS]);
+ if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
+ tb[TCA_TBF_RTAB],
+ NULL));
+
+ if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
+ tb[TCA_TBF_PTAB],
+ NULL));
+
+ buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
+ mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
+
+ if (tb[TCA_TBF_RATE64])
+ rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
+ psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
+
+ if (tb[TCA_TBF_BURST]) {
+ max_size = nla_get_u32(tb[TCA_TBF_BURST]);
+ buffer = psched_l2t_ns(&rate, max_size);
+ } else {
+ max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
+ }
+
+ if (qopt->peakrate.rate) {
+ if (tb[TCA_TBF_PRATE64])
+ prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
+ psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
+ if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
+ pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
+ peak.rate_bytes_ps, rate.rate_bytes_ps);
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (tb[TCA_TBF_PBURST]) {
+ u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
+ max_size = min_t(u32, max_size, pburst);
+ mtu = psched_l2t_ns(&peak, pburst);
+ } else {
+ max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
+ }
+ } else {
+ memset(&peak, 0, sizeof(peak));
+ }
+
+ if (max_size < psched_mtu(qdisc_dev(sch)))
+ pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
+ max_size, qdisc_dev(sch)->name,
+ psched_mtu(qdisc_dev(sch)));
+
+ if (!max_size) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (q->qdisc != &noop_qdisc) {
+ err = fifo_set_limit(q->qdisc, qopt->limit);
+ if (err)
+ goto done;
+ } else if (qopt->limit > 0) {
+ child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
+ extack);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto done;
+ }
+
+ /* child is fifo, no need to check for noop_qdisc */
+ qdisc_hash_add(child, true);
+ }
+
+ sch_tree_lock(sch);
+ if (child) {
+ qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+ q->qdisc->qstats.backlog);
+ qdisc_put(q->qdisc);
+ q->qdisc = child;
+ }
+ q->limit = qopt->limit;
+ if (tb[TCA_TBF_PBURST])
+ q->mtu = mtu;
+ else
+ q->mtu = PSCHED_TICKS2NS(qopt->mtu);
+ q->max_size = max_size;
+ if (tb[TCA_TBF_BURST])
+ q->buffer = buffer;
+ else
+ q->buffer = PSCHED_TICKS2NS(qopt->buffer);
+ q->tokens = q->buffer;
+ q->ptokens = q->mtu;
+
+ memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
+ memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
+
+ sch_tree_unlock(sch);
+ err = 0;
+done:
+ return err;
+}
+
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+ q->qdisc = &noop_qdisc;
+
+ if (!opt)
+ return -EINVAL;
+
+ q->t_c = ktime_get_ns();
+
+ return tbf_change(sch, opt, extack);
+}
+
+static void tbf_destroy(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ qdisc_put(q->qdisc);
+}
+
+static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *nest;
+ struct tc_tbf_qopt opt;
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ opt.limit = q->limit;
+ psched_ratecfg_getrate(&opt.rate, &q->rate);
+ if (tbf_peak_present(q))
+ psched_ratecfg_getrate(&opt.peakrate, &q->peak);
+ else
+ memset(&opt.peakrate, 0, sizeof(opt.peakrate));
+ opt.mtu = PSCHED_NS2TICKS(q->mtu);
+ opt.buffer = PSCHED_NS2TICKS(q->buffer);
+ if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
+ TCA_TBF_PAD))
+ goto nla_put_failure;
+ if (tbf_peak_present(q) &&
+ q->peak.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
+ TCA_TBF_PAD))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ *old = qdisc_replace(sch, new, &q->qdisc);
+ return 0;
+}
+
+static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops tbf_class_ops = {
+ .graft = tbf_graft,
+ .leaf = tbf_leaf,
+ .find = tbf_find,
+ .walk = tbf_walk,
+ .dump = tbf_dump_class,
+};
+
+static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &tbf_class_ops,
+ .id = "tbf",
+ .priv_size = sizeof(struct tbf_sched_data),
+ .enqueue = tbf_enqueue,
+ .dequeue = tbf_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = tbf_init,
+ .reset = tbf_reset,
+ .destroy = tbf_destroy,
+ .change = tbf_change,
+ .dump = tbf_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init tbf_module_init(void)
+{
+ return register_qdisc(&tbf_qdisc_ops);
+}
+
+static void __exit tbf_module_exit(void)
+{
+ unregister_qdisc(&tbf_qdisc_ops);
+}
+module_init(tbf_module_init)
+module_exit(tbf_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
new file mode 100644
index 000000000..163364124
--- /dev/null
+++ b/net/sched/sch_teql.c
@@ -0,0 +1,530 @@
+/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/moduleparam.h>
+#include <net/dst.h>
+#include <net/neighbour.h>
+#include <net/pkt_sched.h>
+
+/*
+ How to setup it.
+ ----------------
+
+ After loading this module you will find a new device teqlN
+ and new qdisc with the same name. To join a slave to the equalizer
+ you should just set this qdisc on a device f.e.
+
+ # tc qdisc add dev eth0 root teql0
+ # tc qdisc add dev eth1 root teql0
+
+ That's all. Full PnP 8)
+
+ Applicability.
+ --------------
+
+ 1. Slave devices MUST be active devices, i.e., they must raise the tbusy
+ signal and generate EOI events. If you want to equalize virtual devices
+ like tunnels, use a normal eql device.
+ 2. This device puts no limitations on physical slave characteristics
+ f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
+ Certainly, large difference in link speeds will make the resulting
+ eqalized link unusable, because of huge packet reordering.
+ I estimate an upper useful difference as ~10 times.
+ 3. If the slave requires address resolution, only protocols using
+ neighbour cache (IPv4/IPv6) will work over the equalized link.
+ Other protocols are still allowed to use the slave device directly,
+ which will not break load balancing, though native slave
+ traffic will have the highest priority. */
+
+struct teql_master {
+ struct Qdisc_ops qops;
+ struct net_device *dev;
+ struct Qdisc *slaves;
+ struct list_head master_list;
+ unsigned long tx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_errors;
+ unsigned long tx_dropped;
+};
+
+struct teql_sched_data {
+ struct Qdisc *next;
+ struct teql_master *m;
+ struct sk_buff_head q;
+};
+
+#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
+
+#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
+
+/* "teql*" qdisc routines */
+
+static int
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct teql_sched_data *q = qdisc_priv(sch);
+
+ if (q->q.qlen < dev->tx_queue_len) {
+ __skb_queue_tail(&q->q, skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ return qdisc_drop(skb, sch, to_free);
+}
+
+static struct sk_buff *
+teql_dequeue(struct Qdisc *sch)
+{
+ struct teql_sched_data *dat = qdisc_priv(sch);
+ struct netdev_queue *dat_queue;
+ struct sk_buff *skb;
+ struct Qdisc *q;
+
+ skb = __skb_dequeue(&dat->q);
+ dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
+ q = rcu_dereference_bh(dat_queue->qdisc);
+
+ if (skb == NULL) {
+ struct net_device *m = qdisc_dev(q);
+ if (m) {
+ dat->m->slaves = sch;
+ netif_wake_queue(m);
+ }
+ } else {
+ qdisc_bstats_update(sch, skb);
+ }
+ sch->q.qlen = dat->q.qlen + q->q.qlen;
+ return skb;
+}
+
+static struct sk_buff *
+teql_peek(struct Qdisc *sch)
+{
+ /* teql is meant to be used as root qdisc */
+ return NULL;
+}
+
+static void
+teql_reset(struct Qdisc *sch)
+{
+ struct teql_sched_data *dat = qdisc_priv(sch);
+
+ skb_queue_purge(&dat->q);
+ sch->q.qlen = 0;
+}
+
+static void
+teql_destroy(struct Qdisc *sch)
+{
+ struct Qdisc *q, *prev;
+ struct teql_sched_data *dat = qdisc_priv(sch);
+ struct teql_master *master = dat->m;
+
+ if (!master)
+ return;
+
+ prev = master->slaves;
+ if (prev) {
+ do {
+ q = NEXT_SLAVE(prev);
+ if (q == sch) {
+ NEXT_SLAVE(prev) = NEXT_SLAVE(q);
+ if (q == master->slaves) {
+ master->slaves = NEXT_SLAVE(q);
+ if (q == master->slaves) {
+ struct netdev_queue *txq;
+ spinlock_t *root_lock;
+
+ txq = netdev_get_tx_queue(master->dev, 0);
+ master->slaves = NULL;
+
+ root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc));
+ spin_lock_bh(root_lock);
+ qdisc_reset(rtnl_dereference(txq->qdisc));
+ spin_unlock_bh(root_lock);
+ }
+ }
+ skb_queue_purge(&dat->q);
+ break;
+ }
+
+ } while ((prev = q) != master->slaves);
+ }
+}
+
+static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct teql_master *m = (struct teql_master *)sch->ops;
+ struct teql_sched_data *q = qdisc_priv(sch);
+
+ if (dev->hard_header_len > m->dev->hard_header_len)
+ return -EINVAL;
+
+ if (m->dev == dev)
+ return -ELOOP;
+
+ q->m = m;
+
+ skb_queue_head_init(&q->q);
+
+ if (m->slaves) {
+ if (m->dev->flags & IFF_UP) {
+ if ((m->dev->flags & IFF_POINTOPOINT &&
+ !(dev->flags & IFF_POINTOPOINT)) ||
+ (m->dev->flags & IFF_BROADCAST &&
+ !(dev->flags & IFF_BROADCAST)) ||
+ (m->dev->flags & IFF_MULTICAST &&
+ !(dev->flags & IFF_MULTICAST)) ||
+ dev->mtu < m->dev->mtu)
+ return -EINVAL;
+ } else {
+ if (!(dev->flags&IFF_POINTOPOINT))
+ m->dev->flags &= ~IFF_POINTOPOINT;
+ if (!(dev->flags&IFF_BROADCAST))
+ m->dev->flags &= ~IFF_BROADCAST;
+ if (!(dev->flags&IFF_MULTICAST))
+ m->dev->flags &= ~IFF_MULTICAST;
+ if (dev->mtu < m->dev->mtu)
+ m->dev->mtu = dev->mtu;
+ }
+ q->next = NEXT_SLAVE(m->slaves);
+ NEXT_SLAVE(m->slaves) = sch;
+ } else {
+ q->next = sch;
+ m->slaves = sch;
+ m->dev->mtu = dev->mtu;
+ m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
+ }
+ return 0;
+}
+
+
+static int
+__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
+ struct net_device *dev, struct netdev_queue *txq,
+ struct dst_entry *dst)
+{
+ struct neighbour *n;
+ int err = 0;
+
+ n = dst_neigh_lookup_skb(dst, skb);
+ if (!n)
+ return -ENOENT;
+
+ if (dst->dev != dev) {
+ struct neighbour *mn;
+
+ mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
+ neigh_release(n);
+ if (IS_ERR(mn))
+ return PTR_ERR(mn);
+ n = mn;
+ }
+
+ if (neigh_event_send(n, skb_res) == 0) {
+ int err;
+ char haddr[MAX_ADDR_LEN];
+
+ neigh_ha_snapshot(haddr, n, dev);
+ err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)),
+ haddr, NULL, skb->len);
+
+ if (err < 0)
+ err = -EINVAL;
+ } else {
+ err = (skb_res == NULL) ? -EAGAIN : 1;
+ }
+ neigh_release(n);
+ return err;
+}
+
+static inline int teql_resolve(struct sk_buff *skb,
+ struct sk_buff *skb_res,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ int res;
+
+ if (rcu_access_pointer(txq->qdisc) == &noop_qdisc)
+ return -ENODEV;
+
+ if (!dev->header_ops || !dst)
+ return 0;
+
+ rcu_read_lock();
+ res = __teql_resolve(skb, skb_res, dev, txq, dst);
+ rcu_read_unlock();
+
+ return res;
+}
+
+static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct teql_master *master = netdev_priv(dev);
+ struct Qdisc *start, *q;
+ int busy;
+ int nores;
+ int subq = skb_get_queue_mapping(skb);
+ struct sk_buff *skb_res = NULL;
+
+ start = master->slaves;
+
+restart:
+ nores = 0;
+ busy = 0;
+
+ q = start;
+ if (!q)
+ goto drop;
+
+ do {
+ struct net_device *slave = qdisc_dev(q);
+ struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
+
+ if (slave_txq->qdisc_sleeping != q)
+ continue;
+ if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
+ !netif_running(slave)) {
+ busy = 1;
+ continue;
+ }
+
+ switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
+ case 0:
+ if (__netif_tx_trylock(slave_txq)) {
+ unsigned int length = qdisc_pkt_len(skb);
+
+ if (!netif_xmit_frozen_or_stopped(slave_txq) &&
+ netdev_start_xmit(skb, slave, slave_txq, false) ==
+ NETDEV_TX_OK) {
+ __netif_tx_unlock(slave_txq);
+ master->slaves = NEXT_SLAVE(q);
+ netif_wake_queue(dev);
+ master->tx_packets++;
+ master->tx_bytes += length;
+ return NETDEV_TX_OK;
+ }
+ __netif_tx_unlock(slave_txq);
+ }
+ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
+ busy = 1;
+ break;
+ case 1:
+ master->slaves = NEXT_SLAVE(q);
+ return NETDEV_TX_OK;
+ default:
+ nores = 1;
+ break;
+ }
+ __skb_pull(skb, skb_network_offset(skb));
+ } while ((q = NEXT_SLAVE(q)) != start);
+
+ if (nores && skb_res == NULL) {
+ skb_res = skb;
+ goto restart;
+ }
+
+ if (busy) {
+ netif_stop_queue(dev);
+ return NETDEV_TX_BUSY;
+ }
+ master->tx_errors++;
+
+drop:
+ master->tx_dropped++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int teql_master_open(struct net_device *dev)
+{
+ struct Qdisc *q;
+ struct teql_master *m = netdev_priv(dev);
+ int mtu = 0xFFFE;
+ unsigned int flags = IFF_NOARP | IFF_MULTICAST;
+
+ if (m->slaves == NULL)
+ return -EUNATCH;
+
+ flags = FMASK;
+
+ q = m->slaves;
+ do {
+ struct net_device *slave = qdisc_dev(q);
+
+ if (slave == NULL)
+ return -EUNATCH;
+
+ if (slave->mtu < mtu)
+ mtu = slave->mtu;
+ if (slave->hard_header_len > LL_MAX_HEADER)
+ return -EINVAL;
+
+ /* If all the slaves are BROADCAST, master is BROADCAST
+ If all the slaves are PtP, master is PtP
+ Otherwise, master is NBMA.
+ */
+ if (!(slave->flags&IFF_POINTOPOINT))
+ flags &= ~IFF_POINTOPOINT;
+ if (!(slave->flags&IFF_BROADCAST))
+ flags &= ~IFF_BROADCAST;
+ if (!(slave->flags&IFF_MULTICAST))
+ flags &= ~IFF_MULTICAST;
+ } while ((q = NEXT_SLAVE(q)) != m->slaves);
+
+ m->dev->mtu = mtu;
+ m->dev->flags = (m->dev->flags&~FMASK) | flags;
+ netif_start_queue(m->dev);
+ return 0;
+}
+
+static int teql_master_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static void teql_master_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct teql_master *m = netdev_priv(dev);
+
+ stats->tx_packets = m->tx_packets;
+ stats->tx_bytes = m->tx_bytes;
+ stats->tx_errors = m->tx_errors;
+ stats->tx_dropped = m->tx_dropped;
+}
+
+static int teql_master_mtu(struct net_device *dev, int new_mtu)
+{
+ struct teql_master *m = netdev_priv(dev);
+ struct Qdisc *q;
+
+ q = m->slaves;
+ if (q) {
+ do {
+ if (new_mtu > qdisc_dev(q)->mtu)
+ return -EINVAL;
+ } while ((q = NEXT_SLAVE(q)) != m->slaves);
+ }
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static const struct net_device_ops teql_netdev_ops = {
+ .ndo_open = teql_master_open,
+ .ndo_stop = teql_master_close,
+ .ndo_start_xmit = teql_master_xmit,
+ .ndo_get_stats64 = teql_master_stats64,
+ .ndo_change_mtu = teql_master_mtu,
+};
+
+static __init void teql_master_setup(struct net_device *dev)
+{
+ struct teql_master *master = netdev_priv(dev);
+ struct Qdisc_ops *ops = &master->qops;
+
+ master->dev = dev;
+ ops->priv_size = sizeof(struct teql_sched_data);
+
+ ops->enqueue = teql_enqueue;
+ ops->dequeue = teql_dequeue;
+ ops->peek = teql_peek;
+ ops->init = teql_qdisc_init;
+ ops->reset = teql_reset;
+ ops->destroy = teql_destroy;
+ ops->owner = THIS_MODULE;
+
+ dev->netdev_ops = &teql_netdev_ops;
+ dev->type = ARPHRD_VOID;
+ dev->mtu = 1500;
+ dev->min_mtu = 68;
+ dev->max_mtu = 65535;
+ dev->tx_queue_len = 100;
+ dev->flags = IFF_NOARP;
+ dev->hard_header_len = LL_MAX_HEADER;
+ netif_keep_dst(dev);
+}
+
+static LIST_HEAD(master_dev_list);
+static int max_equalizers = 1;
+module_param(max_equalizers, int, 0);
+MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
+
+static int __init teql_init(void)
+{
+ int i;
+ int err = -ENODEV;
+
+ for (i = 0; i < max_equalizers; i++) {
+ struct net_device *dev;
+ struct teql_master *master;
+
+ dev = alloc_netdev(sizeof(struct teql_master), "teql%d",
+ NET_NAME_UNKNOWN, teql_master_setup);
+ if (!dev) {
+ err = -ENOMEM;
+ break;
+ }
+
+ if ((err = register_netdev(dev))) {
+ free_netdev(dev);
+ break;
+ }
+
+ master = netdev_priv(dev);
+
+ strlcpy(master->qops.id, dev->name, IFNAMSIZ);
+ err = register_qdisc(&master->qops);
+
+ if (err) {
+ unregister_netdev(dev);
+ free_netdev(dev);
+ break;
+ }
+
+ list_add_tail(&master->master_list, &master_dev_list);
+ }
+ return i ? 0 : err;
+}
+
+static void __exit teql_exit(void)
+{
+ struct teql_master *master, *nxt;
+
+ list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
+
+ list_del(&master->master_list);
+
+ unregister_qdisc(&master->qops);
+ unregister_netdev(master->dev);
+ free_netdev(master->dev);
+ }
+}
+
+module_init(teql_init);
+module_exit(teql_exit);
+
+MODULE_LICENSE("GPL");