diff options
Diffstat (limited to 'man')
140 files changed, 30924 insertions, 0 deletions
diff --git a/man/Makefile b/man/Makefile new file mode 100644 index 0000000..c0b0d41 --- /dev/null +++ b/man/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0 +INSTALL=install +INSTALLDIR=install -m 0755 -d +INSTALLMAN=install -m 0644 +# Pass the same parameters as Lintian uses on Debian. +MAN_CHECK=LC_ALL=en_US.UTF-8 MANROFFSEQ='' MANWIDTH=100 man --warnings \ + --encoding=UTF-8 --local-file --troff-device=utf8 --ditroff +# Hide man output, count and print errors. +MAN_REDIRECT=2>&1 >/dev/null | tee /dev/fd/2 | wc -l + +SUBDIRS = man3 man7 man8 + +all clean install check: + @for subdir in $(SUBDIRS); do $(MAKE) -C $$subdir $@ || exit $$?; done + +distclean: clean + +.PHONY: install clean distclean check + +.EXPORT_ALL_VARIABLES: diff --git a/man/man3/Makefile b/man/man3/Makefile new file mode 100644 index 0000000..1732be2 --- /dev/null +++ b/man/man3/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +MAN3PAGES = $(wildcard *.3) + +all: + +distclean: clean + +clean: + +install: + $(INSTALLDIR) $(DESTDIR)$(MANDIR)/man3 + $(INSTALLMAN) $(MAN3PAGES) $(DESTDIR)$(MANDIR)/man3 + +check: + @for page in $(MAN3PAGES); do test 0 -eq $$($(MAN_CHECK) $$page \ + $(MAN_REDIRECT)) || { echo "Error in $$page"; exit 1; }; done + +.PHONY: install clean distclean check diff --git a/man/man3/libnetlink.3 b/man/man3/libnetlink.3 new file mode 100644 index 0000000..9a2c801 --- /dev/null +++ b/man/man3/libnetlink.3 @@ -0,0 +1,200 @@ +.TH libnetlink 3 +.SH NAME +libnetlink \- A library for accessing the netlink service +.SH SYNOPSIS +.nf +#include <asm/types.h> +.br +#include <libnetlink.h> +.br +#include <linux/netlink.h> +.br +#include <linux/rtnetlink.h> +.sp +int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions) +.sp +int rtnl_wilddump_request(struct rtnl_handle *rth, int family, int type) +.sp +int rtnl_send(struct rtnl_handle *rth, char *buf, int len) +.sp +int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len) +.sp +int 
rtnl_dump_filter(struct rtnl_handle *rth, + int (*filter)(struct sockaddr_nl *, struct nlmsghdr *n, void *), + void *arg1, + int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *arg2) +.sp +int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer, + unsigned groups, struct nlmsghdr *answer, +.br + int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *), +.br + void *jarg) +.sp +int rtnl_listen(struct rtnl_handle *rtnl, + int (*handler)(struct sockaddr_nl *, struct rtnl_ctrl_data *, + struct nlmsghdr *n, void *), + void *jarg) +.sp +int rtnl_from_file(FILE *rtnl, + int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *jarg) +.sp +int addattr32(struct nlmsghdr *n, int maxlen, int type, __u32 data) +.sp +int addattr_l(struct nlmsghdr *n, int maxlen, int type, void *data, int alen) +.sp +int rta_addattr32(struct rtattr *rta, int maxlen, int type, __u32 data) +.sp +int rta_addattr_l(struct rtattr *rta, int maxlen, int type, void *data, int alen) +.SH DESCRIPTION +libnetlink provides a higher level interface to +.BR rtnetlink (7). +The read functions return 0 on success and a negative errno on failure. +The send functions return the amount of data sent, or -1 on error. +.TP +rtnl_open +Open a rtnetlink socket and save the state into the +.B rth +handle. This handle is passed to all subsequent calls. +.B subscriptions +is a bitmap of the rtnetlink multicast groups the socket will be +a member of. + +.TP +rtnl_wilddump_request +Request a full dump of the +.B type +database for +.B family +addresses. +.B type +is a rtnetlink message type. +.\" XXX + +.TP +rtnl_dump_request +Request a full dump of the +.B type +data buffer into +.B buf +with maximum length of +.B len. +.B type +is a rtnetlink message type. + +.TP +rtnl_dump_filter +Receive netlink data after a request and filter it. +The +.B filter +callback checks if the received message is wanted. 
It gets the source +address of the message, the message itself and +.B arg1 +as arguments. 0 as return means that the filter passed, a negative +value is returned +by +.I rtnl_dump_filter +in case of error. NULL for +.I filter +means to not use a filter. +.B junk +is used to filter messages not destined to the local socket. +Only one message bundle is received. If there is a message +pending, this function does not block. + +.TP +rtnl_listen +Receive netlink data after a request and pass it to +.I handler. +.B handler +is a callback that gets the message source address, ancillary data, the message +itself, and the +.B jarg +cookie as arguments. It will get called for all received messages. +Only one message bundle is received. If there is a message +pending this function does not block. + +.TP +rtnl_from_file +Works like +.I rtnl_listen, +but reads a netlink message bundle from the file +.B file +and passes the messages to +.B handler +for parsing. The file should contain raw data as received from a rtnetlink socket. +.PP +The following functions are useful to construct custom rtnetlink messages. For +simple database dumping with filtering it is better to use the higher level +functions above. See +.BR rtnetlink (3) +and +.BR netlink (3) +on how to generate a rtnetlink message. The following utility functions +require a contiguous buffer that already contains a netlink message header +and a rtnetlink request. + +.TP +rtnl_send +Send the rtnetlink message in +.B buf +of length +.B len +to handle +.B rth. + +.TP +addattr32 +Add a __u32 attribute of type +.B type +and with value +.B data +to netlink message +.B n, +which is part of a buffer of length +.B maxlen. + +.TP +addattr_l +Add a variable length attribute of type +.B type +and with value +.B data +and +.B alen +length to netlink message +.B n, +which is part of a buffer of length +.B maxlen. +.B data +is copied. + +.TP +rta_addattr32 +Initialize the rtnetlink attribute +.B rta +with a __u32 data value. 
+ +.TP +rta_addattr_l +Initialize the rtnetlink attribute +.B rta +with a variable length data value. + +.SH BUGS +This library is meant for internal use, use libmnl for new programs. + +The functions sometimes use fprintf and exit when a fatal error occurs. +This library should be named librtnetlink. + +.SH AUTHORS +netlink/rtnetlink was designed and written by Alexey Kuznetsov. +Andi Kleen wrote the man page. + +.SH SEE ALSO +.BR netlink (7), +.BR rtnetlink (7) +.br +/usr/include/linux/rtnetlink.h diff --git a/man/man7/Makefile b/man/man7/Makefile new file mode 100644 index 0000000..c0e545a --- /dev/null +++ b/man/man7/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +MAN7PAGES = $(wildcard *.7) + +all: + +distclean: clean + +clean: + +install: + $(INSTALLDIR) $(DESTDIR)$(MANDIR)/man7 + $(INSTALLMAN) $(MAN7PAGES) $(DESTDIR)$(MANDIR)/man7 + +check: + @for page in $(MAN7PAGES); do test 0 -eq $$($(MAN_CHECK) $$page \ + $(MAN_REDIRECT)) || { echo "Error in $$page"; exit 1; }; done + +.PHONY: install clean distclean check diff --git a/man/man7/tc-hfsc.7 b/man/man7/tc-hfsc.7 new file mode 100644 index 0000000..72fb5f2 --- /dev/null +++ b/man/man7/tc-hfsc.7 @@ -0,0 +1,563 @@ +.TH "TC\-HFSC" 7 "31 October 2011" iproute2 Linux +.SH "NAME" +tc-hfsc \- Hierarchical Fair Service Curve +. +.SH "HISTORY & INTRODUCTION" +. +HFSC (Hierarchical Fair Service Curve) is a network packet scheduling algorithm that was first presented at +SIGCOMM'97. Developed as a part of ALTQ (ALTernative Queuing) on NetBSD, found +its way quickly to other BSD systems, and then a few years ago became part of +the linux kernel. Still, it's not the most popular scheduling algorithm \- +especially if compared to HTB \- and it's not well documented for the enduser. This introduction aims to explain how HFSC works without using +too much math (although some math will be +inevitable). + +In short HFSC aims to: +. 
+.RS 4 +.IP \fB1)\fR 4 +guarantee precise bandwidth and delay allocation for all leaf classes (realtime +criterion) +.IP \fB2)\fR +allocate excess bandwidth fairly as specified by class hierarchy (linkshare & +upperlimit criterion) +.IP \fB3)\fR +minimize any discrepancy between the service curve and the actual amount of +service provided during linksharing +.RE +.PP +. +The main "selling" point of HFSC is feature \fB(1)\fR, which is achieved by +using nonlinear service curves (more about what it actually is later). This is +particularly useful in VoIP or games, where not only a guarantee of consistent +bandwidth is important, but also limiting the initial delay of a data stream. Note that +it matters only for leaf classes (where the actual queues are) \- thus class +hierarchy is ignored in the realtime case. + +Feature \fB(2)\fR is well, obvious \- any algorithm featuring class hierarchy +(such as HTB) strives to achieve that. HFSC does that well, although +you might end with unusual situations, if you define service curves carelessly +\- see section CORNER CASES for examples. + +Feature \fB(3)\fR is mentioned due to the nature of the problem. There may be +situations where it's either not possible to guarantee service of all curves at +the same time, and/or it's impossible to do so fairly. Both will be explained +later. Note that this is mainly related to interior (aka aggregate) classes, as +the leafs are already handled by \fB(1)\fR. Still, it's perfectly possible to +create a leaf class without realtime service, and in such a case the caveats will +naturally extend to leaf classes as well. + +.SH ABBREVIATIONS +For the remaining part of the document, we'll use following shortcuts: +.nf +.RS 4 + +RT \- realtime +LS \- linkshare +UL \- upperlimit +SC \- service curve +.RE +.fi +. +.SH "BASICS OF HFSC" +. +To understand how HFSC works, we must first introduce a service curve. 
+Overall, it's a nondecreasing function of some time unit, returning the amount +of +service (an allowed or allocated amount of bandwidth) at some specific point in +time. The purpose of it should be subconsciously obvious: if a class was +allowed to transfer not less than the amount specified by its service curve, +then the service curve is not violated. + +Still, we need more elaborate criterion than just the above (although in +the most generic case it can be reduced to it). The criterion has to take two +things into account: +. +.RS 4 +.IP \(bu 4 +idling periods +.IP \(bu +the ability to "look back", so if during current active period the service curve is violated, maybe it +isn't if we count excess bandwidth received during earlier active period(s) +.RE +.PP +Let's define the criterion as follows: +.RS 4 +.nf +.IP "\fB(1)\fR" 4 +For each t1, there must exist t0 in set B, so S(t1\-t0)\~<=\~w(t0,t1) +.fi +.RE +. +.PP +Here 'w' denotes the amount of service received during some time period between t0 +and t1. B is a set of all times, where a session becomes active after idling +period (further denoted as 'becoming backlogged'). For a clearer picture, +imagine two situations: +. +.RS 4 +.IP \fBa)\fR 4 +our session was active during two periods, with a small time gap between them +.IP \fBb)\fR +as in (a), but with a larger gap +.RE +. +.PP +Consider \fB(a)\fR: if the service received during both periods meets +\fB(1)\fR, then all is well. But what if it doesn't do so during the 2nd +period? If the amount of service received during the 1st period is larger +than the service curve, then it might compensate for smaller service during +the 2nd period \fIand\fR the gap \- if the gap is small enough. + +If the gap is larger \fB(b)\fR \- then it's less likely to happen (unless the +excess bandwidth allocated during the 1st part was really large). Still, the +larger the gap \- the less interesting is what happened in the past (e.g. 
10 +minutes ago) \- what matters is the current traffic that just started. + +From HFSC's perspective, more interesting is answering the following question: +when should we start transferring packets, so a service curve of a class is not +violated. Or rephrasing it: How much X() amount of service should a session +receive by time t, so the service curve is not violated. Function X() defined +as below is the basic building block of HFSC, used in: eligible, deadline, +virtual\-time and fit\-time curves. Of course, X() is based on equation +\fB(1)\fR and is defined recursively: + +.RS 4 +.IP \(bu 4 +At the 1st backlogged period beginning function X is initialized to generic +service curve assigned to a class +.IP \(bu +At any subsequent backlogged period, X() is: +.nf +\fBmin(X() from previous period ; w(t0)+S(t\-t0) for t>=t0),\fR +.fi +\&... where t0 denotes the beginning of the current backlogged period. +.RE +. +.PP +HFSC uses either linear, or two\-piece linear service curves. In case of +linear or two\-piece linear convex functions (first slope < second slope), +min() in X's definition reduces to the 2nd argument. But in case of two\-piece +concave functions, the 1st argument might quickly become lesser for some +t>=t0. Note, that for some backlogged period, X() is defined only from that +period's beginning. We also define X^(\-1)(w) as smallest t>=t0, for which +X(t)\~=\~w. We have to define it this way, as X() is usually not an injection. + +The above generic X() can be one of the following: +. +.RS 4 +.IP "E()" 4 +In realtime criterion, selects packets eligible for sending. If none are +eligible, HFSC will use linkshare criterion. Eligible time \&'et' is calculated +with reference to packets' heads ( et\~=\~E^(\-1)(w) ). It's based on RT +service curve, \fIbut in case of a convex curve, uses its 2nd slope only.\fR +.IP "D()" +In realtime criterion, selects the most suitable packet from the ones chosen +by E(). 
Deadline time \&'dt' corresponds to packets' tails +(dt\~=\~D^(\-1)(w+l), where \&'l' is packet's length). Based on RT service +curve. +.IP "V()" +In linkshare criterion, arbitrates which packet to send next. Note that V() is +function of a virtual time \- see \fBLINKSHARE CRITERION\fR section for +details. Virtual time \&'vt' corresponds to packets' heads +(vt\~=\~V^(\-1)(w)). Based on LS service curve. +.IP "F()" +An extension to linkshare criterion, used to limit at which speed linkshare +criterion is allowed to dequeue. Fit\-time 'ft' corresponds to packets' heads +as well (ft\~=\~F^(\-1)(w)). Based on UL service curve. +.RE + +Be sure to make clean distinction between session's RT, LS and UL service +curves and the above "utility" functions. +. +.SH "REALTIME CRITERION" +. +RT criterion \fIignores class hierarchy\fR and guarantees precise bandwidth and +delay allocation. We say that a packet is eligible for sending, when the +current real +time is later than the eligible time of the packet. From all eligible packets, the one most +suited for sending is the one with the shortest deadline time. This sounds +simple, but consider the following example: + +Interface 10Mbit, two classes, both with two\-piece linear service curves: +.RS 4 +.IP \(bu 4 +1st class \- 2Mbit for 100ms, then 7Mbit (convex \- 1st slope < 2nd slope) +.IP \(bu +2nd class \- 7Mbit for 100ms, then 2Mbit (concave \- 1st slope > 2nd slope) +.RE +.PP +Assume for a moment, that we only use D() for both finding eligible packets, +and choosing the most fitting one, thus eligible time would be computed as +D^(\-1)(w) and deadline time would be computed as D^(\-1)(w+l). If the 2nd +class starts sending packets 1 second after the 1st class, it's of course +impossible to guarantee 14Mbit, as the interface capability is only 10Mbit. +The only workaround in this scenario is to allow the 1st class to send the +packets earlier that would normally be allowed. That's where separate E() comes +to help. 
Putting all the math aside (see HFSC paper for details), E() for RT +concave service curve is just like D(), but for the RT convex service curve \- +it's constructed using \fIonly\fR RT service curve's 2nd slope (in our example + 7Mbit). + +The effect of such E() \- packets will be sent earlier, and at the same time +D() \fIwill\fR be updated \- so the current deadline time calculated from it +will be later. Thus, when the 2nd class starts sending packets later, both +the 1st and the 2nd class will be eligible, but the 2nd session's deadline +time will be smaller and its packets will be sent first. When the 1st class +becomes idle at some later point, the 2nd class will be able to "buffer" up +again for later active period of the 1st class. + +A short remark \- in a situation, where the total amount of bandwidth +available on the interface is larger than the allocated total realtime parts +(imagine a 10 Mbit interface, but 1Mbit/2Mbit and 2Mbit/1Mbit classes), the sole +speed of the interface could suffice to guarantee the times. + +Important part of RT criterion is that apart from updating its D() and E(), +also V() used by LS criterion is updated. Generally the RT criterion is +secondary to LS one, and used \fIonly\fR if there's a risk of violating precise +realtime requirements. Still, the "participation" in bandwidth distributed by +LS criterion is there, so V() has to be updated along the way. LS criterion can +then properly compensate for non\-ideal fair sharing situation, caused by RT +scheduling. If you use UL service curve its F() will be updated as well (UL +service curve is an extension to LS one \- see \fBUPPERLIMIT CRITERION\fR +section). + +Anyway \- careless specification of LS and RT service curves can lead to +potentially undesired situations (see CORNER CASES for examples). This wasn't +the case in HFSC paper where LS and RT service curves couldn't be specified +separately. + +.SH "LINKSHARING CRITERION" +. 
+LS criterion's task is to distribute bandwidth according to specified class +hierarchy. Contrary to RT criterion, there're no comparisons between current +real time and virtual time \- the decision is based solely on direct comparison +of virtual times of all active subclasses \- the one with the smallest vt wins +and gets scheduled. One immediate conclusion from this fact is that absolute +values don't matter \- only ratios between them (so for example, two children +classes with simple linear 1Mbit service curves will get the same treatment +from LS criterion's perspective, as if they were 5Mbit). The other conclusion +is, that in perfectly fluid system with linear curves, all virtual times across +whole class hierarchy would be equal. + +Why is VC defined in term of virtual time (and what is it)? + +Imagine an example: class A with two children \- A1 and A2, both with let's say +10Mbit SCs. If A2 is idle, A1 receives all the bandwidth of A (and update its +V() in the process). When A2 becomes active, A1's virtual time is already +\fIfar\fR later than A2's one. Considering the type of decision made by LS +criterion, A1 would become idle for a long time. We can workaround this +situation by adjusting virtual time of the class becoming active \- we do that +by getting such time "up to date". HFSC uses a mean of the smallest and the +biggest virtual time of currently active children fit for sending. As it's not +real time anymore (excluding trivial case of situation where all classes become +active at the same time, and never become idle), it's called virtual time. + +Such approach has its price though. 
The problem is analogous to what was +presented in previous section and is caused by non\-linearity of service +curves: +.IP 1) 4 +either it's impossible to guarantee service curves and satisfy fairness +during certain time periods: + +.RS 4 +Recall the example from RT section, slightly modified (with 3Mbit slopes +instead of 2Mbit ones): + +.IP \(bu 4 +1st class \- 3Mbit for 100ms, then 7Mbit (convex \- 1st slope < 2nd slope) +.IP \(bu +2nd class \- 7Mbit for 100ms, then 3Mbit (concave \- 1st slope > 2nd slope) + +.PP +They sum up nicely to 10Mbit \- the interface's capacity. But if we wanted to only +use LS for guarantees and fairness \- it simply won't work. In LS context, +only V() is used for making decision which class to schedule. If the 2nd class +becomes active when the 1st one is in its second slope, the fairness will be +preserved \- ratio will be 1:1 (7Mbit:7Mbit), but LS itself is of course +unable to guarantee the absolute values themselves \- as it would have to go +beyond of what the interface is capable of. +.RE + +.IP 2) 4 +and/or it's impossible to guarantee service curves of all classes at the same +time [fairly or not]: + +.RS 4 + +This is similar to the above case, but a bit more subtle. We will consider two +subtrees, arbitrated by their common (root here) parent: + +.nf +R (root) -\ 10Mbit + +A \- 7Mbit, then 3Mbit +A1 \- 5Mbit, then 2Mbit +A2 \- 2Mbit, then 1Mbit + +B \- 3Mbit, then 7Mbit +.fi + +R arbitrates between left subtree (A) and right (B). Assume that A2 and B are +constantly backlogged, and at some later point A1 becomes backlogged (when all +other classes are in their 2nd linear part). + +What happens now? B (choice made by R) will \fIalways\fR get 7 Mbit as R is +only (obviously) concerned with the ratio between its direct children. Thus A +subtree gets 3Mbit, but its children would want (at the point when A1 became +backlogged) 5Mbit + 1Mbit. That's of course impossible, as they can only get +3Mbit due to interface limitation. 
+ +In the left subtree \- we have the same situation as previously (fair split +between A1 and A2, but violated guarantees), but in the whole tree \- there's +no fairness (B got 7Mbit, but A1 and A2 have to fit together in 3Mbit) and +there's no guarantees for all classes (only B got what it wanted). Even if we +violated fairness in the A subtree and set A2's service curve to 0, A1 would +still not get the required bandwidth. +.RE +. +.SH "UPPERLIMIT CRITERION" +. +UL criterion is an extension to LS one, that permits sending packets only +if current real time is later than fit\-time ('ft'). So the modified LS +criterion becomes: choose the smallest virtual time from all active children, +such that fit\-time < current real time also holds. Fit\-time is calculated +from F(), which is based on UL service curve. As you can see, its role is +kinda similar to E() used in RT criterion. Also, for obvious reasons \- you +can't specify UL service curve without LS one. + +The main purpose of the UL service curve is to limit HFSC to bandwidth available on the +upstream router (think adsl home modem/router, and linux server as +NAT/firewall/etc. with 100Mbit+ connection to mentioned modem/router). +Typically, it's used to create a single class directly under root, setting +a linear UL service curve to available bandwidth \- and then creating your class +structure from that class downwards. Of course, you're free to add a UL service +curve (linear or not) to any class with LS criterion. + +An important part about the UL service curve is that whenever at some point in time +a class doesn't qualify for linksharing due to its fit\-time, the next time it +does qualify it will update its virtual time to the smallest virtual time of +all active children fit for linksharing. 
This way, one of the main things the LS +criterion tries to achieve \- equality of all virtual times across whole +hierarchy \- is preserved (in perfectly fluid system with only linear curves, +all virtual times would be equal). + +Without that, 'vt' would lag behind other virtual times, and could cause +problems. Consider an interface with a capacity of 10Mbit, and the following leaf classes +(just in case you're skipping this text quickly \- this example shows behavior +that \f(BIdoesn't happen\fR): + +.nf +A \- ls 5.0Mbit +B \- ls 2.5Mbit +C \- ls 2.5Mbit, ul 2.5Mbit +.fi + +If B was idle, while A and C were constantly backlogged, A and C would normally +(as far as LS criterion is concerned) divide bandwidth in 2:1 ratio. But due +to UL service curve in place, C would get at most 2.5Mbit, and A would get the +remaining 7.5Mbit. The longer the backlogged period, the more the virtual times of +A and C would drift apart. If B became backlogged at some later point in time, +its virtual time would be set to (A's\~vt\~+\~C's\~vt)/2, thus blocking A from +sending any traffic until B's virtual time catches up with A. +. +.SH "SEPARATE LS / RT SCs" +. +Another difference from the original HFSC paper is that RT and LS SCs can be +specified separately. Moreover, leaf classes are allowed to have only either +RT SC or LS SC. For interior classes, only LS SCs make sense: any RT SC will +be ignored. +. +.SH "CORNER CASES" +. +Separate service curves for LS and RT criteria can lead to certain traps +that come from "fighting" between ideal linksharing and enforced realtime +guarantees. Those situations didn't exist in original HFSC paper, where +specifying separate LS / RT service curves was not discussed. + +Consider an interface with a 10Mbit capacity, with the following leaf classes: + +.nf +A \- ls 5.0Mbit, rt 8Mbit +B \- ls 2.5Mbit +C \- ls 2.5Mbit +.fi + +Imagine A and C are constantly backlogged. 
As B is idle, A and C would divide +bandwidth in 2:1 ratio, considering LS service curve (so in theory \- 6.66 and +3.33). Alas RT criterion takes priority, so A will get 8Mbit and LS will be +able to compensate class C for only 2 Mbit \- this will cause discrepancy +between virtual times of A and C. + +Assume this situation lasts for a long time with no idle periods, and +suddenly B becomes active. B's virtual time will be updated to +(A's\~vt\~+\~C's\~vt)/2, effectively landing in the middle between A's and C's +virtual time. The effect \- B, having no RT guarantees, will be punished and +will not be allowed to transfer until C's virtual time catches up. + +If the interface had a higher capacity, for example 100Mbit, this example +would behave perfectly fine though. + +Let's look a bit closer at the above example \- it "cleverly" invalidates one +of the basic things LS criterion tries to achieve \- equality of all virtual +times across class hierarchy. Leaf classes without RT service curves are +literally left to their own fate (governed by messed up virtual times). + +Also, it doesn't make much sense. Class A will always be guaranteed up to +8Mbit, and this is more than any absolute bandwidth that could happen from its +LS criterion (excluding trivial case of only A being active). If the bandwidth +taken by A is smaller than absolute value from LS criterion, the unused part +will be automatically assigned to other active classes (as A has idling periods +in such case). The only "advantage" is, that even in case of low bandwidth on +average, bursts would be handled at the speed defined by RT criterion. Still, +if extra speed is needed (e.g. due to latency), non linear service curves +should be used in such case. + +In other words: the LS criterion is meaningless in the above example. 
+ +You can quickly "workaround" it by making sure each leaf class has RT service +curve assigned (thus guaranteeing all of them will get some bandwidth), but it +doesn't make it any more valid. + +Keep in mind \- if you use nonlinear curves and irregularities explained above +happen \fIonly\fR in the first segment, then there's little wrong with +"overusing" RT curve a bit: + +.nf +A \- ls 5.0Mbit, rt 9Mbit/30ms, then 1Mbit +B \- ls 2.5Mbit +C \- ls 2.5Mbit +.fi + +Here, the vt of A will "spike" in the initial period, but then A will never get more +than 1Mbit until B & C catch up. Then everything will be back to normal. +. +.SH "LINUX AND TIMER RESOLUTION" +. +In certain situations, the scheduler can throttle itself and set up a so\-called +watchdog to wake up the dequeue function at some time later. In case of HFSC +it happens when for example no packet is eligible for scheduling, and UL +service curve is used to limit the speed at which LS criterion is allowed to +dequeue packets. It's called throttling, and accuracy of it is dependent on +how the kernel is compiled. + +There're 3 important options in modern kernels, as far as timers' resolution +goes: \&'tickless system', \&'high resolution timer support' and \&'timer +frequency'. + +If you have \&'tickless system' enabled, then the timer interrupt will trigger +as slowly as possible, but each time a scheduler throttles itself (or any +other part of the kernel needs better accuracy), the rate will be increased as +needed / possible. The ceiling is either \&'timer frequency' if \&'high +resolution timer support' is not available or not compiled in, or it's +hardware dependent and can go \fIfar\fR beyond the highest \&'timer frequency' +setting available. + +If \&'tickless system' is not enabled, the timer will trigger at a fixed rate +specified by \&'timer frequency' \- regardless if high resolution timers are +or aren't available. 
+ +This is important to keep those settings in mind, as in scenario like: no +tickless, no HR timers, frequency set to 100hz \- throttling accuracy would be +at 10ms. It doesn't automatically mean you would be limited to ~0.8Mbit/s +(assuming packets at ~1KB) \- as long as your queues are prepared to cover for +timer inaccuracy. Of course, in case of e.g. locally generated UDP traffic \- +appropriate socket size is needed as well. Short example to make it more +understandable (assume hardcore anti\-schedule settings \- HZ=100, no HR +timers, no tickless): + +.nf +tc qdisc add dev eth0 root handle 1:0 hfsc default 1 +tc class add dev eth0 parent 1:0 classid 1:1 hfsc rt m2 10Mbit +.fi + +Assuming packet of ~1KB size and HZ=100, that averages to ~0.8Mbit \- anything +beyond it (e.g. the above example with specified rate over 10x larger) will +require appropriate queuing and cause bursts every ~10 ms. As you can +imagine, any HFSC's RT guarantees will be seriously invalidated by that. +Aforementioned example is mainly important if you deal with old hardware \- as +is particularly popular for home server chores. Even then, you can easily +set HZ=1000 and have very accurate scheduling for typical adsl speeds. + +Anything modern (apic or even hpet msi based timers + \&'tickless system') +will provide enough accuracy for superb 1Gbit scheduling. For example, on one +of my cheap dual-core AMD boards I have the following settings: + +.nf +tc qdisc add dev eth0 parent root handle 1:0 hfsc default 1 +tc class add dev eth0 parent 1:0 classid 1:1 hfsc rt m2 300mbit +.fi + +And a simple: + +.nf +nc \-u dst.host.com 54321 </dev/zero +nc \-l \-p 54321 >/dev/null +.fi + +\&...will yield the following effects over a period of ~10 seconds (taken from +/proc/interrupts): + +.nf +319: 42124229 0 HPET_MSI\-edge hpet2 (before) +319: 42436214 0 HPET_MSI\-edge hpet2 (after 10s.) +.fi + +That's roughly 31000/s. Now compare it with HZ=1000 setting. 
The obvious +drawback of it is that cpu load can be rather high with servicing that +many timer interrupts. The example with 300Mbit RT service curve on 1Gbit link is +particularly ugly, as it requires a lot of throttling with minuscule delays. + +Also note that it's just an example showing the capabilities of current hardware. +The above example (essentially a 300Mbit TBF emulator) is pointless on an internal +interface to begin with: you will pretty much always want a regular LS service +curve there, and in such a scenario HFSC simply doesn't throttle at all. + +300Mbit RT service curve (selected columns from mpstat \-P ALL 1): + +.nf +10:56:43 PM CPU %sys %irq %soft %idle +10:56:44 PM all 20.10 6.53 34.67 37.19 +10:56:44 PM 0 35.00 0.00 63.00 0.00 +10:56:44 PM 1 4.95 12.87 6.93 73.27 +.fi + +So, in the rare case you need those speeds with only a RT service curve, or with a UL +service curve: remember the drawbacks. +. +.SH "CAVEAT: RANDOM ONLINE EXAMPLES" +. +For reasons unknown (though well guessed), many examples you can google love to +overuse UL criterion and stuff it in every node possible. This makes no sense +and works against what HFSC tries to do (and does pretty damn well). Use UL +where it makes sense: on the uppermost node to match upstream router's uplink +capacity. Or in special cases, such as testing (limit certain subtree to some +speed), or customers that must never get more than certain speed. In the last +case you can usually achieve the same by just using a RT criterion without LS+UL +on leaf nodes. + +As for the router case - remember it's good to differentiate between "traffic to +router" (remote console, web config, etc.) and "outgoing traffic", so for +example: + +.nf +tc qdisc add dev eth0 root handle 1:0 hfsc default 0x8002 +tc class add dev eth0 parent 1:0 classid 1:999 hfsc rt m2 50Mbit +tc class add dev eth0 parent 1:0 classid 1:1 hfsc ls m2 2Mbit ul m2 2Mbit +.fi + +\&... so "internet" tree under 1:1 and "router itself" as 1:999 +. 
+.SH "LAYER2 ADAPTATION" +. +Please refer to \fBtc\-stab\fR(8) +. +.SH "SEE ALSO" +. +\fBtc\fR(8), \fBtc\-hfsc\fR(8), \fBtc\-stab\fR(8) + +Please direct bugreports and patches to: <netdev@vger.kernel.org> +. +.SH "AUTHOR" +. +Manpage created by Michal Soltys (soltys@ziu.info) diff --git a/man/man8/.gitignore b/man/man8/.gitignore new file mode 100644 index 0000000..7b08e91 --- /dev/null +++ b/man/man8/.gitignore @@ -0,0 +1,5 @@ +# these pages are built +ip-address.8 +ip-link.8 +ip-netns.8 +ip-route.8 diff --git a/man/man8/Makefile b/man/man8/Makefile new file mode 100644 index 0000000..6dab182 --- /dev/null +++ b/man/man8/Makefile @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0 +TARGETS = ip-address.8 ip-link.8 ip-netns.8 ip-route.8 + +MAN8PAGES = $(TARGETS) $(filter-out $(TARGETS),$(wildcard *.8)) + +all: $(TARGETS) + +%: %.in + sed \ + -e "s|@NETNS_ETC_DIR@|$(NETNS_ETC_DIR)|g" \ + -e "s|@NETNS_RUN_DIR@|$(NETNS_RUN_DIR)|g" \ + -e "s|@SYSCONF_ETC_DIR@|$(CONF_ETC_DIR)|g" \ + -e "s|@SYSCONF_USR_DIR@|$(CONF_USR_DIR)|g" \ + $< > $@ + +distclean: clean + +clean: + @rm -f $(TARGETS) + +install: + $(INSTALLDIR) $(DESTDIR)$(MANDIR)/man8 + $(INSTALLMAN) $(MAN8PAGES) $(DESTDIR)$(MANDIR)/man8 + +check: all + @for page in $(MAN8PAGES); do test 0 -eq $$($(MAN_CHECK) $$page \ + $(MAN_REDIRECT)) || { echo "Error in $$page"; exit 1; }; done + +.PHONY: install clean distclean check diff --git a/man/man8/arpd.8 b/man/man8/arpd.8 new file mode 100644 index 0000000..5050a98 --- /dev/null +++ b/man/man8/arpd.8 @@ -0,0 +1,69 @@ +.TH ARPD 8 "28 June, 2007" + +.SH NAME +arpd \- userspace arp daemon. + +.SH SYNOPSIS +Usage: arpd [ -lkh? ] [ -a N ] [ -b dbase ] [ -B number ] [ -f file ] [-p interval ] [ -n time ] [ -R rate ] [ <INTERFACES> ] + +.SH DESCRIPTION +The +.B arpd +daemon collects gratuitous ARP information, saving it on local disk and feeding it to the kernel on demand to avoid redundant broadcasting due to limited size of the kernel ARP cache. + +.SH OPTIONS +.TP +-h -? 
+Print help +.TP +-l +Dump the arpd database to stdout and exit. The output consists of three columns: the interface index, the IP address of the interface, and the MAC address of the interface. Negative entries for dead hosts are also shown, in this case the MAC address is replaced by the word FAILED followed by a colon and the most recent time when the fact that the host is dead was proven. +.TP +-f <FILE> +Read and load an arpd database from FILE in a text format similar to that dumped by option -l. Exit after load, possibly listing resulting database, if option -l is also given. If FILE is -, stdin is read to get the ARP table. +.TP +-b <DATABASE> +the location of the database file. The default location is /var/lib/arpd/arpd.db +.TP +-a <NUMBER> +With this option, arpd not only passively listens for ARP packets on the interface, but also sends broadcast queries itself. NUMBER is the number of such queries to make before a destination is considered dead. When arpd is started as kernel helper (i.e. with app_solicit enabled in sysctl or even with option -k) without this option and still did not learn enough information, you can observe 1 second gaps in service. Not fatal, but not good. +.TP +-k +Suppress sending broadcast queries by the kernel. This option only makes sense together with option -a. +.TP +-n <TIME> +Specifies the timeout of the negative cache. When resolution fails, arpd suppresses further attempts to resolve for this period. This option only makes sense together with option '-k'. This timeout should not be too much longer than the boot time of a typical host not supporting gratuitous ARP. Default value is 60 seconds. +.TP +-p <TIME> +The time to wait in seconds between polling attempts to the kernel ARP table. TIME may be a floating point number. The default value is 30. +.TP +-R <RATE> +Maximal steady rate of broadcasts sent by arpd in packets per second. Default value is 1. +.TP +-B <NUMBER> +The number of broadcasts sent by arpd back to back. 
Default value is 3. Together with the -R option, this option ensures that the number of ARP queries that are broadcast does not exceed B+R*T over any interval of time T. +.P +<INTERFACES> is a list of names of networking interfaces to watch. If no interfaces are given, arpd monitors all the interfaces. In this case arpd does not adjust sysctl parameters, it is assumed that the user does this himself after arpd is started. +.P +.SH SIGNALS +.TP +When arpd receives a SIGINT or SIGTERM signal, it exits gracefully, syncing the database and restoring adjusted sysctl parameters. On a SIGHUP it syncs the database to disk. With SIGUSR1 it sends some statistics to syslog. The effect of any other signals is undefined. In particular, they may corrupt the database and leave the sysctl parameters in an unpredictable state. +.P +.SH NOTE +.TP +In order for arpd to be able to serve as ARP resolver, the kernel must be compiled with the option CONFIG_ARPD and, in the case when interface list in not given on command line, variable app_solicit on interfaces of interest should be in /proc/sys/net/ipv4/neigh/*. If this is not made arpd still collects gratuitous ARP information in its database. +.SH EXAMPLES +.TP +arpd -b /var/tmp/arpd.db +Start arpd to collect gratuitous ARP, but not messing with kernel functionality. +.TP +killall arpd ; arpd -l -b /var/tmp/arpd.db +Look at result after some time. +.TP +arpd -b /var/tmp/arpd.db -a 1 eth0 eth1 +Enable kernel helper, leaving leading role to kernel. +.TP +arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1 +Completely replace kernel resolution on interfaces eth0 and eth1. In this case the kernel still does unicast probing to validate entries, but all the broadcast activity is suppressed and made under authority of arpd. +.PP +This is the mode in which arpd normally is supposed to work. It is not the default to prevent occasional enabling of too aggressive a mode. 
diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 new file mode 100644 index 0000000..eeea407 --- /dev/null +++ b/man/man8/bridge.8 @@ -0,0 +1,1602 @@ +.TH BRIDGE 8 "1 August 2012" "iproute2" "Linux" +.SH NAME +bridge \- show / manipulate bridge addresses and devices +.SH SYNOPSIS + +.ad l +.in +8 +.ti -8 +.B bridge +.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OBJECT " := { " +.BR link " | " fdb " | " mdb " | " vlan " | " vni " | " monitor " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-s\fR[\fItatistics\fR] | +\fB\-n\fR[\fIetns\fR] name | +\fB\-b\fR[\fIatch\fR] filename | +\fB\-c\fR[\fIolor\fR] | +\fB\-p\fR[\fIretty\fR] | +\fB\-j\fR[\fIson\fR] | +\fB\-o\fR[\fIneline\fR] } + +.ti -8 +.B "bridge link set" +.B dev +.IR DEV " [ " +.B cost +.IR COST " ] [ " +.B priority +.IR PRIO " ] [ " +.B state +.IR STATE " ] [ " +.BR guard " { " on " | " off " } ] [ " +.BR hairpin " { " on " | " off " } ] [ " +.BR fastleave " { " on " | " off " } ] [ " +.BR root_block " { " on " | " off " } ] [ " +.BR learning " { " on " | " off " } ] [ " +.BR learning_sync " { " on " | " off " } ] [ " +.BR flood " { " on " | " off " } ] [ " +.BR hwmode " { " vepa " | " veb " } ] [ " +.BR bcast_flood " { " on " | " off " } ] [ " +.BR mcast_flood " { " on " | " off " } ] [ " +.BR mcast_max_groups +.IR MAX_GROUPS " ] [" +.BR mcast_router +.IR MULTICAST_ROUTER " ] [" +.BR mcast_to_unicast " { " on " | " off " } ] [ " +.BR neigh_suppress " { " on " | " off " } ] [ " +.BR neigh_vlan_suppress " { " on " | " off " } ] [ " +.BR vlan_tunnel " { " on " | " off " } ] [ " +.BR isolated " { " on " | " off " } ] [ " +.BR locked " { " on " | " off " } ] [ " +.BR mab " { " on " | " off " } ] [ " +.B backup_port +.IR DEVICE " ] [" +.BR nobackup_port " ] [ " +.B backup_nhid +.IR NHID " ] [" +.BR self " ] [ " master " ]" + +.ti -8 +.BR "bridge link" " [ " show " ] [ " +.B dev +.IR DEV " ] [" +.B master +.IR DEVICE " ]" + +.ti -8 +.BR "bridge fdb" " 
{ " add " | " append " | " del " | " replace " } " +.I LLADDR +.B dev +.IR DEV " { " +.BR local " | " static " | " dynamic " } [ " +.BR self " ] [ " master " ] [ " router " ] [ " use " ] [ " extern_learn " ] [ " sticky " ] [ " +.B src_vni +.IR VNI " ] { [" +.B dst +.IR IPADDR " ] [ " +.B vni +.IR VNI " ] [" +.B port +.IR PORT " ] [" +.B via +.IR DEVICE " ] | " +.B nhid +.IR NHID " } " + +.ti -8 +.BR "bridge fdb" " [ [ " show " ] [ " +.B br +.IR BRDEV " ] [ " +.B brport +.IR DEV " ] [ " +.B vlan +.IR VID " ] [ " +.B state +.IR STATE " ] [" +.B dynamic +.IR "] ]" + +.ti -8 +.BR "bridge fdb get" " [" +.B to +.IR "]" +.I LLADDR "[ " +.B br +.IR BRDEV " ]" +.B { brport | dev } +.IR DEV " [ " +.B vlan +.IR VID " ] [ " +.B vni +.IR VNI " ] [" +.BR self " ] [ " master " ] [ " dynamic " ]" + +.ti -8 +.BR "bridge fdb flush" +.B dev +.IR DEV " [ " +.B brport +.IR DEV " ] [ " +.B vlan +.IR VID " ] [ " +.B src_vni +.IR VNI " ] [ " +.B nhid +.IR NHID " ] [" +.B vni +.IR VNI " ] [ " +.B port +.IR PORT " ] [" +.B dst +.IR IPADDR " ] [ " +.BR self " ] [ " master " ] [ " +.BR [no]permanent " | " [no]static " | " [no]dynamic " ] [ " +.BR [no]added_by_user " ] [ " [no]extern_learn " ] [ " +.BR [no]sticky " ] [ " [no]offloaded " ] [ " [no]router " ]" + +.ti -8 +.BR "bridge mdb" " { " add " | " del " | " replace " } " +.B dev +.I DEV +.B port +.I PORT +.B grp +.IR GROUP " [ " +.B src +.IR SOURCE " ] [ " +.BR permanent " | " temp " ] [ " +.B vid +.IR VID " ] [ " +.BR filter_mode " { " include " | " exclude " } ] [ " +.B source_list +.IR SOURCE_LIST " ] [ " +.B proto +.IR PROTO " ] [ " +.B dst +.IR IPADDR " ] [ " +.B dst_port +.IR DST_PORT " ] [ " +.B vni +.IR VNI " ] [ " +.B src_vni +.IR SRC_VNI " ] [ " +.B via +.IR DEV " ] + +.ti -8 +.BR "bridge mdb show" " [ " +.B dev +.IR DEV " ]" + +.ti -8 +.B "bridge mdb get" +.BI dev " DEV " grp " GROUP " +.RB "[ " src +.IR SOURCE " ]" +.RB "[ " vid +.IR VID " ]" +.RB "[ " src_vni +.IR SRC_VNI " ]" + +.ti -8 +.B "bridge mdb flush" +.BI dev " DEV " 
+.RB "[ " port +.IR PORT " ]" +.RB "[ " vid +.IR VID " ]" +.RB "[ " src_vni +.IR SRC_VNI " ]" +.RB "[ " proto +.IR PROTO " ]" +.RB "[ " [no]permanent " ]" +.RB "[ " dst +.IR IPADDR " ]" +.RB "[ " dst_port +.IR DST_PORT " ]" +.RB "[ " vni +.IR VNI " ]" + +.ti -8 +.BR "bridge vlan" " { " add " | " del " } " +.B dev +.I DEV +.B vid +.IR VID " [ " +.B tunnel_info +.IR TUNNEL_ID " ] [ " +.BR pvid " ] [ " untagged " ] [ " +.BR self " ] [ " master " ] " + +.ti -8 +.BR "bridge vlan set" +.B dev +.I DEV +.B vid +.IR VID " [ " +.B state +.IR STP_STATE " ] [ " +.B mcast_max_groups +.IR MAX_GROUPS " ] [ " +.B mcast_router +.IR MULTICAST_ROUTER " ] [ " +.BR neigh_suppress " { " on " | " off " } ]" + +.ti -8 +.BR "bridge vlan" " [ " show " | " tunnelshow " ] [ " +.B dev +.IR DEV " ]" + +.ti -8 +.BR "bridge vlan global set" +.B dev +.I DEV +.B vid +.IR VID " [ " +.B mcast_snooping +.IR MULTICAST_SNOOPING " ] [ " +.B mcast_querier +.IR MULTICAST_QUERIER " ] [ " +.B mcast_igmp_version +.IR IGMP_VERSION " ] [ " +.B mcast_mld_version +.IR MLD_VERSION " ] [ " +.B mcast_last_member_count +.IR LAST_MEMBER_COUNT " ] [ " +.B mcast_last_member_interval +.IR LAST_MEMBER_INTERVAL " ] [ " +.B mcast_startup_query_count +.IR STARTUP_QUERY_COUNT " ] [ " +.B mcast_startup_query_interval +.IR STARTUP_QUERY_INTERVAL " ] [ " +.B mcast_membership_interval +.IR MEMBERSHIP_INTERVAL " ] [ " +.B mcast_querier_interval +.IR QUERIER_INTERVAL " ] [ " +.B mcast_query_interval +.IR QUERY_INTERVAL " ] [ " +.B mcast_query_response_interval +.IR QUERY_RESPONSE_INTERVAL " ]" + +.ti -8 +.BR "bridge vlan global" " [ " show " ] [ " +.B dev +.IR DEV " ] [ " +.B vid +.IR VID " ]" + +.ti -8 +.BR "bridge vlan" " show " [ " +.B dev +.IR DEV " ]" + +.ti -8 +.BR "bridge vni" " { " add " | " del " } " +.B dev +.I DEV +.B vni +.IR VNI " [ { " +.B group | remote "} " +.IR IPADDR " ] " + +.ti -8 +.BR "bridge vni" " show " [ " +.B dev +.IR DEV " ]" + +.ti -8 +.BR "bridge monitor" " [ " all " | " neigh " | " link " | " mdb " | " 
vlan " ]" + +.SH OPTIONS + +.TP +.BR "\-V" , " -Version" +print the version of the +.B bridge +utility and exit. + +.TP +.BR "\-s" , " \-stats", " \-statistics" +output more information. If this option +is given multiple times, the amount of information increases. +As a rule, the information is statistics or some time values. + +.TP +.BR "\-d" , " \-details" +print detailed information about bridge vlan filter entries or MDB router ports. + +.TP +.BR "\-n" , " \-net" , " \-netns " <NETNS> +switches +.B bridge +to the specified network namespace +.IR NETNS . +Actually it just simplifies executing of: + +.B ip netns exec +.I NETNS +.B bridge +.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" + +to + +.B bridge +.RI "-n[etns] " NETNS " [ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" + +.TP +.BR "\-b", " \-batch " <FILENAME> +Read commands from provided file or standard input and invoke them. +First failure will cause termination of bridge command. + +.TP +.B "\-force" +Don't terminate bridge command on errors in batch mode. +If there were any errors during execution of the commands, the application +return code will be non zero. + +.TP +.BR \-c [ color ][ = { always | auto | never } +Configure color output. If parameter is omitted or +.BR always , +color output is enabled regardless of stdout state. If parameter is +.BR auto , +stdout is checked to be a terminal before enabling color output. If parameter is +.BR never , +color output is disabled. If specified multiple times, the last one takes +precedence. This flag is ignored if +.B \-json +is also given. + +.TP +.BR "\-j", " \-json" +Output results in JavaScript Object Notation (JSON). + +.TP +.BR "\-p", " \-pretty" +When combined with -j generate a pretty JSON output. + +.TP +.BR "\-o", " \-oneline" +output each record on a single line, replacing line feeds +with the +.B '\e' +character. This is convenient when you want to count records +with +.BR wc (1) +or to +.BR grep (1) +the output. 
+ + +.SH BRIDGE - COMMAND SYNTAX + +.SS +.I OBJECT + +.TP +.B link +- Bridge port. + +.TP +.B fdb +- Forwarding Database entry. + +.TP +.B mdb +- Multicast group database entry. + +.TP +.B vlan +- VLAN filter list. + +.TP +.B vni +- VNI filter list. + +.SS +.I COMMAND + +Specifies the action to perform on the object. +The set of possible actions depends on the object type. +As a rule, it is possible to +.BR "add" , " delete" +and +.B show +(or +.B list +) objects, but some objects do not allow all of these operations +or have some additional commands. The +.B help +command is available for all objects. It prints +out a list of available commands and argument syntax conventions. +.sp +If no command is given, some default command is assumed. +Usually it is +.B list +or, if the objects of this class cannot be listed, +.BR "help" . + +.SH bridge link - bridge port + +.B link +objects correspond to the port devices of the bridge. + +.P +The corresponding commands set and display port status and bridge specific +attributes. + +.SS bridge link set - set bridge specific attributes on a port + +.TP +.BI dev " NAME " +interface name of the bridge port + +.TP +.BI cost " COST " +the STP path cost of the specified port. + +.TP +.BI priority " PRIO " +the STP port priority. The priority value is an unsigned 8-bit quantity +(number between 0 and 255). This metric is used in the designated port an +droot port selection algorithms. + +.TP +.BI state " STATE " +the operation state of the port. Except state 0 (disable STP or BPDU filter feature), +this is primarily used by user space STP/RSTP +implementation. One may enter port state name (case insensitive), or one of the +numbers below. Negative inputs are ignored, and unrecognized names return an +error. + +.B 0 +- port is in STP +.B DISABLED +state. Make this port completely inactive for STP. This is also called +BPDU filter and could be used to disable STP on an untrusted port, like +a leaf virtual devices. 
+.sp + +.B 1 +- port is in STP +.B LISTENING +state. Only valid if STP is enabled on the bridge. In this +state the port listens for STP BPDUs and drops all other traffic frames. +.sp + +.B 2 +- port is in STP +.B LEARNING +state. Only valid if STP is enabled on the bridge. In this +state the port will accept traffic only for the purpose of updating MAC +address tables. +.sp + +.B 3 +- port is in STP +.B FORWARDING +state. Port is fully active. +.sp + +.B 4 +- port is in STP +.B BLOCKING +state. Only valid if STP is enabled on the bridge. This state +is used during the STP election process. In this state, port will only process +STP BPDUs. +.sp + +.TP +.BR "guard on " or " guard off " +Controls whether STP BPDUs will be processed by the bridge port. By default, +the flag is turned off allowed BPDU processing. Turning this flag on will +disables +the bridge port if a STP BPDU packet is received. + +If running Spanning Tree on bridge, hostile devices on the network +may send BPDU on a port and cause network failure. Setting +.B guard on +will detect and stop this by disabling the port. +The port will be restarted if link is brought down, or +removed and reattached. For example if guard is enable on +eth0: + +.B ip link set dev eth0 down; ip link set dev eth0 up + +.TP +.BR "hairpin on " or " hairpin off " +Controls whether traffic may be send back out of the port on which it was +received. This option is also called reflective relay mode, and is used to support +basic VEPA (Virtual Ethernet Port Aggregator) capabilities. +By default, this flag is turned off and the bridge will not forward +traffic back out of the receiving port. + +.TP +.BR "fastleave on " or " fastleave off " +This flag allows the bridge to immediately stop multicast traffic on a port +that receives IGMP Leave message. It is only used with IGMP snooping is +enabled on the bridge. By default the flag is off. 
+ +.TP +.BR "root_block on " or " root_block off " +Controls whether a given port is allowed to become root port or not. Only used +when STP is enabled on the bridge. By default the flag is off. + +This feature is also called root port guard. +If BPDU is received from a leaf (edge) port, it should not +be elected as root port. This could be used if using STP on a bridge and the downstream bridges are not fully +trusted; this prevents a hostile guest from rerouting traffic. + +.TP +.BR "learning on " or " learning off " +Controls whether a given port will learn MAC addresses from received traffic or +not. If learning if off, the bridge will end up flooding any traffic for which +it has no FDB entry. By default this flag is on. + +.TP +.BR "learning_sync on " or " learning_sync off " +Controls whether a given port will sync MAC addresses learned on device port to +bridge FDB. + +.TP +.BR "flood on " or " flood off " +Controls whether unicast traffic for which there is no FDB entry will be +flooded towards this given port. By default this flag is on. + +.TP +.B hwmode +Some network interface cards support HW bridge functionality and they may be +configured in different modes. Currently support modes are: + +.B vepa +- Data sent between HW ports is sent on the wire to the external +switch. + +.B veb +- bridging happens in hardware. + +.TP +.BR "bcast_flood on " or " bcast_flood off " +Controls flooding of broadcast traffic on the given port. +By default this flag is on. + +.TP +.BR "mcast_flood on " or " mcast_flood off " +Controls whether multicast traffic for which there is no MDB entry will be +flooded towards this given port. By default this flag is on. + +.TP +.BI mcast_max_groups " MAX_GROUPS " +Sets the maximum number of MDB entries that can be registered for a given +port. Attempts to register more MDB entries at the port than this limit +allows will be rejected, whether they are done through netlink (e.g. 
the +\fBbridge\fR tool), or IGMP or MLD membership reports. Setting a limit to 0 +has the effect of disabling the limit. The default value is 0. See also the +\fBip link\fR option \fBmcast_hash_max\fR. + +.TP +.BI mcast_router " MULTICAST_ROUTER " +This flag is almost the same as the per-VLAN flag, see below, except its +value can only be set in the range 0-2. The default is +.B 1 +where the bridge figures out automatically where an IGMP/MLD querier, +MRDISC capable device, or PIM router, is located. Setting this flag to +.B 2 +is useful in cases where the multicast router does not indicate its +presence in any meaningful way (e.g. older versions of SMCRoute, or +mrouted), or when there is a need for forwarding both known and unknown +IP multicast to a secondary/backup router. + +.TP +.BR "mcast_to_unicast on " or " mcast_to_unicast off " +Controls whether a given port will replicate packets using unicast +instead of multicast. By default this flag is off. + +This is done by copying the packet per host and +changing the multicast destination MAC to a unicast one accordingly. + +.B mcast_to_unicast +works on top of the multicast snooping feature of +the bridge. Which means unicast copies are only delivered to hosts which +are interested in it and signalized this via IGMP/MLD reports +previously. + +This feature is intended for interface types which have a more reliable +and/or efficient way to deliver unicast packets than broadcast ones +(e.g. WiFi). + +However, it should only be enabled on interfaces where no IGMPv2/MLDv1 +report suppression takes place. IGMP/MLD report suppression issue is usually +overcome by the network daemon (supplicant) enabling AP isolation and +by that separating all STAs. + +Delivery of STA-to-STA IP multicast is made possible again by +enabling and utilizing the bridge hairpin mode, which considers the +incoming port as a potential outgoing port, too (see +.B hairpin +option). 
Hairpin mode is performed after multicast snooping, therefore leading to +reports only being delivered to STAs running a multicast router.
When +enabled, FDB entries are learned from received traffic and have the "locked" +FDB flag set. The flag can only be set by the kernel and it indicates that the +FDB entry cannot be used to authenticate the corresponding host. User space can +decide to authenticate the host by replacing the FDB entry and clearing the +"locked" FDB flag. Locked FDB entries can roam to unlocked (authorized) ports +in which case the "locked" flag is cleared. FDB entries cannot roam to locked +ports regardless of MAB being enabled or not. Therefore, locked FDB entries are +only created if an FDB entry with the given {MAC, VID} does not already exist. +This behavior prevents unauthenticated hosts from disrupting traffic destined +to already authenticated hosts. Locked FDB entries act like regular dynamic +entries with respect to forwarding and aging. By default this flag is off. + +.TP +.BI backup_port " DEVICE" +If the port loses carrier all traffic will be redirected to the +configured backup port + +.TP +.B nobackup_port +Removes the currently configured backup port + +.TP +.BI backup_nhid " NHID" +The FDB nexthop object ID (see \fBip-nexthop\fR(8)) to attach to packets being +redirected to a backup port that has VLAN tunnel mapping enabled (via the +\fBvlan_tunnel\fR option). Setting a value of 0 (default) has the effect of not +attaching any ID. + +.TP +.B self +link setting is configured on specified physical device + +.TP +.B master +link setting is configured on the software bridge (default) + +.TP +.BR "\-t" , " \-timestamp" +display current time when using monitor option. + +.SS bridge link show - list ports configuration for all bridges. + +This command displays ports configuration and flags for all bridges by default. + +.TP +.BI dev " DEV" +only display the specific bridge port named DEV. + +.TP +.BI master " DEVICE" +only display ports of the bridge named DEVICE. This is similar to +"ip link show master <bridge_device>" command. 
+ +.SH bridge fdb - forwarding database management + +.B fdb +objects contain known Ethernet addresses on a link. + +.P +The corresponding commands display fdb entries, add new entries, +append entries, +and delete old ones. + +.SS bridge fdb add - add a new fdb entry + +This command creates a new fdb entry. + +.TP +.B LLADDR +the Ethernet MAC address. + +.TP +.BI dev " DEV" +the interface to which this address is associated. + +.B local +- is a local permanent fdb entry, which means that the bridge will not forward +frames with this destination MAC address and VLAN ID, but terminate them +locally. This flag is default unless "static" or "dynamic" are explicitly +specified. +.sp + +.B permanent +- this is a synonym for "local" +.sp + +.B static +- is a static (no arp) fdb entry +.sp + +.B dynamic +- is a dynamic reachable age-able fdb entry +.sp + +.B self +- the operation is fulfilled directly by the driver for the specified network +device. If the network device belongs to a master like a bridge, then the +bridge is bypassed and not notified of this operation (and if the device does +notify the bridge, it is driver-specific behavior and not mandated by this +flag, check the driver for more details). The "bridge fdb add" command can also +be used on the bridge device itself, and in this case, the added fdb entries +will be locally terminated (not forwarded). In the latter case, the "self" flag +is mandatory. The flag is set by default if "master" is not specified. +.sp + +.B master +- if the specified network device is a port that belongs to a master device +such as a bridge, the operation is fulfilled by the master device's driver, +which may in turn notify the port driver too of the address. If the specified +device is a master itself, such as a bridge, this flag is invalid. +.sp + +.B router +- the destination address is associated with a router. +Valid if the referenced device is a VXLAN type device and has +route short circuit enabled. 
+.sp + +.B use +- the address is in use. User space can use this option to +indicate to the kernel that the fdb entry is in use. +.sp + +.B extern_learn +- this entry was learned externally. This option can be used to +indicate to the kernel that an entry was hardware or user-space +controller learnt dynamic entry. Kernel will not age such an entry. +.sp + +.B sticky +- this entry will not change its port due to learning. +.sp + +.in -8 +The next command line parameters apply only +when the specified device +.I DEV +is of type VXLAN. +.TP +.BI dst " IPADDR" +the IP address of the destination +VXLAN tunnel endpoint where the Ethernet MAC ADDRESS resides. + +.TP +.BI src_vni " VNI" +the src VNI Network Identifier (or VXLAN Segment ID) +this entry belongs to. Used only when the vxlan device is in +external or collect metadata mode. If omitted the value specified at +vxlan device creation will be used. + +.TP +.BI vni " VNI" +the VXLAN VNI Network Identifier (or VXLAN Segment ID) +to use to connect to the remote VXLAN tunnel endpoint. +If omitted the value specified at vxlan device creation +will be used. + +.TP +.BI port " PORT" +the UDP destination PORT number to use to connect to the +remote VXLAN tunnel endpoint. +If omitted the default value is used. + +.TP +.BI via " DEVICE" +device name of the outgoing interface for the +VXLAN device driver to reach the +remote VXLAN tunnel endpoint. + +.TP +.BI nhid " NHID " +ecmp nexthop group for the VXLAN device driver +to reach remote VXLAN tunnel endpoints. + +.SS bridge fdb append - append a forwarding database entry +This command adds a new fdb entry with an already known +.IR LLADDR . +Valid only for multicast link layer addresses. +The command adds support for broadcast and multicast +Ethernet MAC addresses. +The Ethernet MAC address is added multiple times into +the forwarding database and the vxlan device driver +sends a copy of the data packet to each entry found. 
+ +.PP +The arguments are the same as with +.BR "bridge fdb add" . + +.SS bridge fdb delete - delete a forwarding database entry +This command removes an existing fdb entry. + +.PP +The arguments are the same as with +.BR "bridge fdb add" . + +.SS bridge fdb replace - replace a forwarding database entry +If no matching entry is found, a new one will be created instead. + +.PP +The arguments are the same as with +.BR "bridge fdb add" . + +.SS bridge fdb show - list forwarding entries. + +This command displays the current forwarding table. + +.PP +With the +.B -statistics +option, the command becomes verbose. It prints out the last updated +and last used time for each entry. + +.SS bridge fdb get - get bridge forwarding entry. + +lookup a bridge forwarding table entry. + +.TP +.B LLADDR +the Ethernet MAC address. + +.TP +.BI dev " DEV" +the interface to which this address is associated. + +.TP +.BI brport " DEV" +the bridge port to which this address is associated. same as dev above. + +.TP +.BI br " DEV" +the bridge to which this address is associated. + +.TP +.B self +- the address is associated with the port drivers fdb. Usually hardware. + +.TP +.B master +- the address is associated with master devices fdb. Usually software (default). + +.SS bridge fdb flush - flush bridge forwarding table entries. + +flush the matching bridge forwarding table entries. Some options below have a negated +form when "no" is prepended to them (e.g. permanent and nopermanent). + +.TP +.BI dev " DEV" +the target device for the operation. If the device is a bridge port and "master" +is set then the operation will be fulfilled by its master device's driver and +all entries pointing to that port will be deleted. + +.TP +.BI brport " DEV" +the target bridge port for the operation. If the bridge device is specified then only +entries pointing to the bridge itself will be deleted. Note that the target device +specified by this option will override the one specified by dev above. 
+ +.TP +.BI vlan " VID" +the target VLAN ID for the operation. Match forwarding table entries only with the +specified VLAN ID. + +.TP +.BI src_vni " VNI" +the src VNI Network Identifier (or VXLAN Segment ID) for the operation. Match +forwarding table entries only with the specified VNI. Valid if the referenced +device is a VXLAN type device. + +.TP +.BI nhid " NHID" +the ECMP nexthop group for the operation. Match forwarding table entries only +with the specified NHID. Valid if the referenced device is a VXLAN type device. + +.TP +.BI vni " VNI" +the VXLAN VNI Network Identifier (or VXLAN Segment ID) for the operation. Match +forwarding table entries only with the specified VNI. Valid if the referenced +device is a VXLAN type device. + +.TP +.BI port " PORT" +the UDP destination PORT number for the operation. Match forwarding table +entries only with the specified PORT. Valid if the referenced device is a VXLAN +type device. + +.TP +.BI dst " IPADDR" +the IP address of the destination VXLAN tunnel endpoint for the operation. Match +forwarding table entries only with the specified IPADDR. Valid if the referenced +device is a VXLAN type device. + +.TP +.B self +the operation is fulfilled directly by the driver for the specified network +device. If the network device belongs to a master like a bridge, then the +bridge is bypassed and not notified of this operation. The "bridge fdb flush" +command can also be used on the bridge device itself. The flag is set by default if +"master" is not specified. + +.TP +.B master +if the specified network device is a port that belongs to a master device +such as a bridge, the operation is fulfilled by the master device's driver. +Flush with both 'master' and 'self' is not recommended with attributes that are +not supported by all devices (e.g., vlan, vni). Such command will be handled by +bridge or VXLAN driver, but will return an error from the driver that does not +support the attribute. 
Instead, run flush twice - once with 'self' and once +with 'master', and each one with the supported attributes. + +.TP +.B [no]permanent +if specified then only permanent entries will be deleted or respectively if "no" +is prepended then only non-permanent entries will be deleted. + +.TP +.B [no]static +if specified then only static entries will be deleted or respectively if "no" +is prepended then only non-static entries will be deleted. + +.TP +.B [no]dynamic +if specified then only dynamic entries will be deleted or respectively if "no" +is prepended then only non-dynamic (static or permanent) entries will be deleted. + +.TP +.B [no]added_by_user +if specified then only entries with added_by_user flag will be deleted or respectively +if "no" is prepended then only entries without added_by_user flag will be deleted. + +.TP +.B [no]extern_learn +if specified then only entries with extern_learn flag will be deleted or respectively +if "no" is prepended then only entries without extern_learn flag will be deleted. + +.TP +.B [no]sticky +if specified then only entries with sticky flag will be deleted or respectively +if "no" is prepended then only entries without sticky flag will be deleted. + +.TP +.B [no]offloaded +if specified then only entries with offloaded flag will be deleted or respectively +if "no" is prepended then only entries without offloaded flag will be deleted. +.sp + +.TP +.B [no]router +if specified then only entries with router flag will be deleted or respectively +if "no" is prepended then only entries without router flag will be deleted. Valid +if the referenced device is a VXLAN type device. +.sp + +.SH bridge mdb - multicast group database management + +.B mdb +objects contain known IP or L2 multicast group addresses on a link. + +.P +The corresponding commands display mdb entries, add new entries, replace +entries and delete old ones. + +.SS bridge mdb add - add a new multicast group database entry + +This command creates a new mdb entry. 
+ +.TP +.BI dev " DEV" +the interface where this group address is associated. + +.TP +.BI port " PORT" +the port whose link is known to have members of this multicast group. + +.TP +.BI grp " GROUP" +the multicast group address (IPv4, IPv6 or L2 multicast) whose members reside +on the link connected to the port. + +.B permanent +- the mdb entry is permanent. Optional for IPv4 and IPv6, mandatory for L2. +.sp + +.B temp +- the mdb entry is temporary (default) +.sp + +.TP +.BI src " SOURCE" +optional source IP address of a sender for this multicast group. If IGMPv3 for IPv4, or +MLDv2 for IPv6 respectively, are enabled it will be included in the lookup when +forwarding multicast traffic. + +.TP +.BI vid " VID" +the VLAN ID which is known to have members of this multicast group. + +.TP +.BR "filter_mode include " or " filter_mode exclude " +controls whether the sources in the entry's source list are in INCLUDE or +EXCLUDE mode. Can only be set for (*, G) entries. + +.TP +.BI source_list " SOURCE_LIST" +optional list of source IP addresses of senders for this multicast group, +separated by a ','. Whether the entry forwards packets from these senders or +not is determined by the entry's filter mode, which becomes a mandatory +argument. Can only be set for (*, G) entries. + +.TP +.BI proto " PROTO" +the routing protocol identifier of this mdb entry. Can be a number or a string +from the file /etc/iproute2/rt_protos. If the routing protocol is not given, +then +.B static +is assumed. + +.in -8 +The next command line parameters apply only +when the specified device +.I DEV +is of type VXLAN. + +.TP +.BI dst " IPADDR" +the IP address of the destination +VXLAN tunnel endpoint where the multicast receivers reside. + +.TP +.BI dst_port " DST_PORT" +the UDP destination port number to use to connect to the remote VXLAN tunnel +endpoint. If omitted, the value specified at VXLAN device creation will be +used. 
+ +.TP +.BI vni " VNI" +the VXLAN VNI Network Identifier to use to connect to the remote VXLAN tunnel +endpoint. If omitted, the value specified at VXLAN device creation will be used +or the source VNI when the VXLAN device is in external mode. + +.TP +.BI src_vni " SRC_VNI" +the source VNI Network Identifier this entry belongs to. Used only when the +VXLAN device is in external mode. If omitted, the value specified at VXLAN +device creation will be used. + +.TP +.BI via " DEV" +device name of the outgoing interface for the VXLAN device to reach the remote +VXLAN tunnel endpoint. + +.in -8 +The 0.0.0.0 and :: MDB entries are special catchall entries used to flood IPv4 +and IPv6 unregistered multicast packets, respectively. Therefore, when these +entries are programmed, the catchall 00:00:00:00:00:00 FDB entry will only +flood broadcast, unknown unicast and link-local multicast. + +.in -8 +.SS bridge mdb delete - delete a multicast group database entry +This command removes an existing mdb entry. + +.PP +The arguments are the same as with +.BR "bridge mdb add" . + +.SS bridge mdb replace - replace a multicast group database entry +If no matching entry is found, a new one will be created instead. + +.PP +The arguments are the same as with +.BR "bridge mdb add" . + +.SS bridge mdb show - list multicast group database entries + +This command displays the current multicast group membership table. The table +is populated by IGMP and MLD snooping in the bridge driver automatically. It +can be altered by +.B bridge mdb add +and +.B bridge mdb del +commands manually too. + +.TP +.BI dev " DEV" +the interface only whose entries should be listed. Default is to list all +bridge interfaces. + +.PP +With the +.B -details +option, the command becomes verbose. It prints out the ports known to have +a connected router. + +.PP +With the +.B -statistics +option, the command displays timer values for mdb and router port entries. 
+ +.SS bridge mdb get - get multicast group database entry. + +This command retrieves a multicast group database entry based on its key. + +.TP +.BI dev " DEV" +the interface where this group address is associated. + +.TP +.BI grp " GROUP" +the multicast group address (IPv4, IPv6 or L2 multicast). + +.TP +.BI src " SOURCE" +the source IP address. Only relevant when retrieving an (S, G) entry. + +.TP +.BI vid " VID" +the VLAN ID. Only relevant when the bridge is VLAN-aware. + +.TP +.BI src_vni " SRC_VNI" +the source VNI Network Identifier. Only relevant when the VXLAN device is in +external mode. + +.SS bridge mdb flush - flush multicast group database entries. + +This command flushes the matching multicast group database entries. + +.TP +.BI dev " DEV" +the interface where this group address is associated. + +.TP +.BI port " PORT" +the target port for the operation. If the bridge device is specified then only +entries pointing to the bridge itself will be deleted. + +.TP +.BI vid " VID" +the VLAN ID for the operation. Match entries only with the specified VLAN ID. + +.TP +.BI src_vni " SRC_VNI" +the source VNI Network Identifier for the operation. Match entries only with +the specified source VNI. + +.TP +.BI proto " PROTO" +the routing protocol identifier for the operation. Match entries only with the +specified routing protocol. Can be a number or a string from the file +/etc/iproute2/rt_protos. + +.TP +.B [no]permanent +if specified then only permanent entries will be deleted or respectively if +"no" is prepended then only non-permanent (temp) entries will be deleted. + +.TP +.BI dst " IPADDR" +the IP address of the destination VXLAN tunnel endpoint where the multicast +receivers reside. Match entries only with the specified destination IP. + +.TP +.BI dst_port " DST_PORT" +the UDP destination port number to use to connect to the remote VXLAN tunnel +endpoint. Match entries only with the specified destination port number. 
+ +.TP +.BI vni " VNI" +the VXLAN VNI Network Identifier to use to connect to the remote VXLAN tunnel +endpoint. Match entries only with the specified destination VNI. + +.SH bridge vlan - VLAN filter list + +.B vlan +objects contain known VLAN IDs for a link. + +.P +The corresponding commands display vlan filter entries, add new entries, +and delete old ones. + +.SS bridge vlan add - add a new vlan filter entry + +This command creates a new vlan filter entry. + +.TP +.BI dev " NAME" +the interface with which this vlan is associated. + +.TP +.BI vid " VID" +the VLAN ID that identifies the vlan. + +.TP +.BI tunnel_info " TUNNEL_ID" +the TUNNEL ID that maps to this vlan. The tunnel id is set in +dst_metadata for every packet that belongs to this vlan (applicable to +bridge ports with vlan_tunnel flag set). + +.TP +.B pvid +the vlan specified is to be considered a PVID at ingress. +Any untagged frames will be assigned to this VLAN. + +.TP +.B untagged +the vlan specified is to be treated as untagged on egress. + +.TP +.B self +the vlan is configured on the specified physical device. Required if the +device is the bridge device. + +.TP +.B master +the vlan is configured on the software bridge (default). + +.SS bridge vlan delete - delete a vlan filter entry +This command removes an existing vlan filter entry. + +.PP +The arguments are the same as with +.BR "bridge vlan add". +The +.BR "pvid " and " untagged" +flags are ignored. + +.SS bridge vlan set - change vlan filter entry's options + +This command changes vlan filter entry's options. + +.TP +.BI dev " NAME" +the interface with which this vlan is associated. + +.TP +.BI vid " VID" +the VLAN ID that identifies the vlan. + +.TP +.BI state " STP_STATE " +the operation state of the vlan. One may enter STP state name (case insensitive), or one of the +numbers below. Negative inputs are ignored, and unrecognized names return an +error. Note that the state is set only for the vlan of the specified device, e.g. 
if it is +a bridge port then the state will be set only for the vlan of the port. + +.B 0 +- vlan is in STP +.B DISABLED +state. Make this vlan completely inactive for STP. This is also called +BPDU filter and could be used to disable STP on an untrusted vlan. +.sp + +.B 1 +- vlan is in STP +.B LISTENING +state. Only valid if STP is enabled on the bridge. In this +state the vlan listens for STP BPDUs and drops all other traffic frames. +.sp + +.B 2 +- vlan is in STP +.B LEARNING +state. Only valid if STP is enabled on the bridge. In this +state the vlan will accept traffic only for the purpose of updating MAC +address tables. +.sp + +.B 3 +- vlan is in STP +.B FORWARDING +state. This is the default vlan state. +.sp + +.B 4 +- vlan is in STP +.B BLOCKING +state. Only valid if STP is enabled on the bridge. This state +is used during the STP election process. In this state, the vlan will only process +STP BPDUs. +.sp + +.TP +.BI mcast_max_groups " MAX_GROUPS " +Sets the maximum number of MDB entries that can be registered for a given +VLAN on a given port. A VLAN-specific equivalent of the per-port option of +the same name, see above for details. + +Note that this option is only available when \fBip link\fR option +\fBmcast_vlan_snooping\fR is enabled. + +.TP +.BI mcast_router " MULTICAST_ROUTER " +configure this vlan and interface's multicast router mode, note that only modes +0 - 2 are available for bridge devices. +A vlan and interface with a multicast router will receive all multicast traffic. +.I MULTICAST_ROUTER +may be either +.sp +.B 0 +- to disable multicast router. +.sp + +.B 1 +- to let the system detect the presence of routers (default). +.sp + +.B 2 +- to permanently enable multicast traffic forwarding on this vlan and interface. +.sp + +.B 3 +- to temporarily mark this vlan and port as having a multicast router, i.e. +enable multicast traffic forwarding. This mode is available only for ports. 
+.sp + +.TP +.BR "neigh_suppress on " or " neigh_suppress off " +Controls whether neigh discovery (arp and nd) proxy and suppression is enabled +for a given VLAN on a given port. By default this flag is off. + +Note that this option only takes effect when \fBbridge link\fR option +\fBneigh_vlan_suppress\fR is enabled for a given port. + +.SS bridge vlan show - list vlan configuration. + +This command displays the current VLAN filter table. + +.PP +With the +.B -details +option, the command becomes verbose. It displays the per-vlan options. + +.PP +With the +.B -statistics +option, the command displays per-vlan traffic statistics. + +.SS bridge vlan tunnelshow - list vlan tunnel mapping. + +This command displays the current vlan tunnel info mapping. + +.SS bridge vlan global set - change vlan filter entry's global options + +This command changes vlan filter entry's global options. + +.TP +.BI dev " NAME" +the interface with which this vlan is associated. Only bridge devices are +supported for global options. + +.TP +.BI vid " VID" +the VLAN ID that identifies the vlan. + +.TP +.BI mcast_snooping " MULTICAST_SNOOPING " +turn multicast snooping for VLAN entry with VLAN ID on +.RI ( MULTICAST_SNOOPING " > 0) " +or off +.RI ( MULTICAST_SNOOPING " == 0). Default is on. " + +.TP +.BI mcast_querier " MULTICAST_QUERIER " +enable +.RI ( MULTICAST_QUERIER " > 0) " +or disable +.RI ( MULTICAST_QUERIER " == 0) " +IGMP/MLD querier, ie sending of multicast queries by the bridge. Default is disabled. + +.TP +.BI mcast_igmp_version " IGMP_VERSION " +set the IGMP version. Default is 2. + +.TP +.BI mcast_mld_version " MLD_VERSION " +set the MLD version. Default is 1. + +.TP +.BI mcast_last_member_count " LAST_MEMBER_COUNT " +set multicast last member count, ie the number of queries the bridge +will send before stopping forwarding a multicast group after a "leave" +message has been received. Default is 2. 
+ +.TP +.BI mcast_last_member_interval " LAST_MEMBER_INTERVAL " +interval between queries to find remaining members of a group, +after a "leave" message is received. + +.TP +.BI mcast_startup_query_count " STARTUP_QUERY_COUNT " +set the number of queries to send during startup phase. Default is 2. + +.TP +.BI mcast_startup_query_interval " STARTUP_QUERY_INTERVAL " +interval between queries in the startup phase. + +.TP +.BI mcast_membership_interval " MEMBERSHIP_INTERVAL " +delay after which the bridge will leave a group, +if no membership reports for this group are received. + +.TP +.BI mcast_querier_interval " QUERIER_INTERVAL " +interval between queries sent by other routers. If no queries are seen +after this delay has passed, the bridge will start to send its own queries +(as if +.BI mcast_querier +was enabled). + +.TP +.BI mcast_query_interval " QUERY_INTERVAL " +interval between queries sent by the bridge after the end of the +startup phase. + +.TP +.BI mcast_query_response_interval " QUERY_RESPONSE_INTERVAL " +set the Max Response Time/Maximum Response Delay for IGMP/MLD +queries sent by the bridge. + +.SS bridge vlan global show - list global vlan options. + +This command displays the global VLAN options for each VLAN entry. + +.TP +.BI dev " DEV" +the interface only whose VLAN global options should be listed. Default is to list +all bridge interfaces. + +.TP +.BI vid " VID" +the VLAN ID only whose global options should be listed. Default is to list +all vlans. + +.SH bridge vni - VNI filter list + +.B vni +objects contain known VNI IDs for a dst metadata vxlan link. + +.P +The corresponding commands display vni filter entries, add new entries, +and delete old ones. + +.SS bridge vni add - add a new vni filter entry + +This command creates a new vni filter entry. + +.TP +.BI dev " NAME" +the interface with which this vni is associated. + +.TP +.BI vni " VNI" +the VNI ID that identifies the vni. 
+ +.TP +.BI remote " IPADDR" +specifies the unicast destination IP address to use in outgoing packets +when the destination link layer address is not known in the VXLAN device +forwarding database. This parameter cannot be specified with the group. + +.TP +.BI group " IPADDR" +specifies the multicast IP address to join for this VNI + +.SS bridge vni del - delete a vni filter entry + +This command removes an existing vni filter entry. + +.PP +The arguments are the same as with +.BR "bridge vni add". + +.SS bridge vni show - list vni filtering configuration. + +This command displays the current vni filter table. + +.PP +With the +.B -statistics +option, the command displays per-vni traffic statistics. + +.TP +.BI dev " NAME" +shows vni filtering table associated with the vxlan device + +.SH bridge monitor - state monitoring + +The +.B bridge +utility can monitor the state of devices and addresses +continuously. This option has a slightly different format. +Namely, the +.B monitor +command is the first in the command line and then the object list follows: + +.BR "bridge monitor" " [ " all " |" +.IR OBJECT-LIST " ]" + +.I OBJECT-LIST +is the list of object types that we want to monitor. +It may contain +.BR link ", " fdb ", " vlan " and " mdb "." +If no +.B file +argument is given, +.B bridge +opens RTNETLINK, listens on it and dumps state changes in the format +described in previous sections. + +.P +If a file name is given, it does not listen on RTNETLINK, +but opens the file containing RTNETLINK messages saved in binary format +and dumps them. + +.SH NOTES +This command uses facilities added in Linux 3.0. + +Although the forwarding table is maintained on a per-bridge device basis +the bridge device is not part of the syntax. This is a limitation of the +underlying netlink neighbour message protocol. When displaying the +forwarding table, entries for all bridges are displayed. 
+Add/delete/modify commands determine the underlying bridge device +based on the bridge to which the corresponding ethernet device is attached. + + +.SH SEE ALSO +.BR ip (8) +.SH BUGS +.RB "Please direct bugreports and patches to: " <netdev@vger.kernel.org> + +.SH AUTHOR +Original Manpage by Stephen Hemminger diff --git a/man/man8/ctstat.8 b/man/man8/ctstat.8 new file mode 100644 index 0000000..080e2b2 --- /dev/null +++ b/man/man8/ctstat.8 @@ -0,0 +1 @@ +.so man8/lnstat.8 diff --git a/man/man8/dcb-app.8 b/man/man8/dcb-app.8 new file mode 100644 index 0000000..be505a0 --- /dev/null +++ b/man/man8/dcb-app.8 @@ -0,0 +1,269 @@ +.TH DCB-APP 8 "6 December 2020" "iproute2" "Linux" +.SH NAME +dcb-app \- show / manipulate application priority table of +the DCB (Data Center Bridging) subsystem +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B app +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb app " { " show " | " flush " } " dev +.RI DEV +.RB "[ " default-prio " ]" +.RB "[ " ethtype-prio " ]" +.RB "[ " stream-port-prio " ]" +.RB "[ " dgram-port-prio " ]" +.RB "[ " port-prio " ]" +.RB "[ " dscp-prio " ]" +.RB "[ " pcp-prio " ]" + +.ti -8 +.B dcb app " { " add " | " del " | " replace " } " dev +.RI DEV +.RB "[ " default-prio " " \fIPRIO-LIST\fB " ]" +.RB "[ " ethtype-prio " " \fIET-MAP\fB " ]" +.RB "[ " stream-port-prio " " \fIPORT-MAP\fB " ]" +.RB "[ " dgram-port-prio " " \fIPORT-MAP\fB " ]" +.RB "[ " port-prio " " \fIPORT-MAP\fB " ]" +.RB "[ " dscp-prio " " \fIDSCP-MAP\fB " ]" +.RB "[ " pcp-prio " " \fIPCP-MAP\fB " ]" + +.ti -8 +.IR PRIO-LIST " := [ " PRIO-LIST " ] " PRIO + +.ti -8 +.IR ET-MAP " := [ " ET-MAP " ] " ET-MAPPING + +.ti -8 +.IR ET-MAPPING " := " ET\fB:\fIPRIO\fR + +.ti -8 +.IR PORT-MAP " := [ " PORT-MAP " ] " PORT-MAPPING + +.ti -8 +.IR PORT-MAPPING " := " PORT\fB:\fIPRIO\fR + +.ti -8 +.IR DSCP-MAP " := [ " DSCP-MAP " ] " DSCP-MAPPING + +.ti -8 +.IR DSCP-MAPPING " := { " DSCP " | " \fBall " }" \fB:\fIPRIO\fR + +.ti -8 +.IR 
PCP-MAP " := [ " PCP-MAP " ] " PCP-MAPPING + +.ti -8 +.IR PCP-MAPPING " := " PCP\fB:\fIPRIO\fR + +.ti -8 +.IR ET " := { " \fB0x600\fR " .. " \fB0xffff\fR " }" + +.ti -8 +.IR PORT " := { " \fB1\fR " .. " \fB65535\fR " }" + +.ti -8 +.IR DSCP " := { " \fB0\fR " .. " \fB63\fR " }" + +.ti -8 +.IR PCP " := { " \fB0(nd/de)\fR " .. " \fB7(nd/de)\fR " }" + +.ti -8 +.IR PRIO " := { " \fB0\fR " .. " \fB7\fR " }" + +.SH DESCRIPTION + +.B dcb app +is used to configure APP table, or application priority table in the DCB (Data +Center Bridging) subsystem. The APP table is used to assign priority to traffic +based on value in one of several headers: EtherType, L4 destination port, or +DSCP. It also allows configuration of port-default priority that is chosen if no +other prioritization rule applies. + +DCB APP entries are 3-tuples of selector, protocol ID, and priority. Selector is +an enumeration that picks one of the prioritization namespaces. Currently it +mostly corresponds to configurable parameters described below. Protocol ID is a +value in the selector namespace. E.g. for EtherType selector, protocol IDs are +the individual EtherTypes, for DSCP they are individual code points. The +priority is the priority that should be assigned to traffic that matches the +selector and protocol ID. + +The APP table is a set of DCB APP entries. The only requirement is that +duplicate entries are not added. Notably, it is valid to have conflicting +priority assignment for the same selector and protocol ID. For example, the set +of two APP entries (DSCP, 10, 1) and (DSCP, 10, 2), where packets with DSCP of +10 should get priority of both 1 and 2, form a well-defined APP table. The +.B dcb app +tool allows low-level management of the app table by adding and deleting +individual APP 3-tuples through +.B add +and +.B del +commands. 
On the other hand, the command +.B replace +does what one would typically want in this situation--first adds the new +configuration, and then removes the obsolete one, so that only one +prioritization is in effect for a given selector and protocol ID. + +.SH COMMANDS + +.TP +.B show +Display all entries with a given selector. When no selector is given, shows all +APP table entries categorized per selector. + +.TP +.B flush +Remove all entries with a given selector. When no selector is given, removes all +APP table entries. + +.TP +.B add +.TQ +.B del +Add and, respectively, remove individual APP 3-tuples to and from the DCB APP +table. + +.TP +.B replace +Take the list of entries mentioned as parameter, and add those that are not +present in the APP table yet. Then remove those entries, whose selector and +protocol ID have been mentioned as parameter, but not with the exact same +priority. This has the effect of, for the given selector and protocol ID, +causing that the table only contains the priority (or priorities) given as +parameter. + +.SH PARAMETERS + +The following table shows parameters in a way that they would be used with +\fBadd\fR, \fBdel\fR and \fBreplace\fR commands. For \fBshow\fR and \fBflush\fR, +the parameter name is to be used as a simple keyword without further arguments. + +.TP +.B default-prio \fIPRIO-LIST +The priority to be used for traffic the priority of which is otherwise +unspecified. The argument is a list of individual priorities. Note that +.B default-prio +rules are configured as triplets (\fBEtherType\fR, \fB0\fR, \fIPRIO\fR). +.B dcb app +translates these rules to the symbolic name +.B default-prio +and back. + +.TP +.B ethtype-prio \fIET-MAP +\fIET-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are EtherType values. Values are priorities to be assigned to +traffic with the matching EtherType. 
+ +.TP +.B stream-port-prio \fIPORT-MAP +.TQ +.B dgram-port-prio \fIPORT-MAP +.TQ +.B port-prio \fIPORT-MAP +\fIPORT-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are L4 destination port numbers that match on, respectively, +TCP and SCTP traffic, UDP and DCCP traffic, and either of those. Values are +priorities that should be assigned to matching traffic. + +.TP +.B dscp-prio \fIDSCP-MAP +\fIDSCP-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are DSCP points, values are priorities assigned to +traffic with matching DSCP. DSCP points can be written either directly as +numeric values, or using symbolic names specified in +.B /etc/iproute2/rt_dsfield +(however note that the file specifies full 8-bit dsfield values, whereas +.B dcb app +will only use the higher six bits). +.B dcb app show +will similarly format DSCP values as symbolic names if possible. The +command line option +.B -N +turns the show translation off. + +.TP +.B pcp-prio \fIPCP-MAP +\fIPCP-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are PCP/DEI. Values are priorities assigned to traffic with +matching PCP/DEI. PCP/DEI values are written as a combination of numeric and +symbolic values, to accommodate both. PCP is always in numerical form, e.g. 0 .. 7, +and DEI in symbolic form, e.g. 'de' (drop-eligible), indicating that the +DEI bit is 1, or 'nd' (not-drop-eligible), indicating that the DEI bit is 0. +In combination, 2de:1 translates to a mapping of PCP=2 and DEI=1 to priority 1. 
+ +.SH EXAMPLE & USAGE + +Prioritize traffic with DSCP 0 to priority 0, 24 to 3 and 48 to 6: + +.P +# dcb app add dev eth0 dscp-prio 0:0 24:3 48:6 + +Add another rule to configure DSCP 24 to priority 2 and show the result: + +.P +# dcb app add dev eth0 dscp-prio 24:2 +.br +# dcb app show dev eth0 dscp-prio +.br +dscp-prio 0:0 CS3:2 CS3:3 CS6:6 +.br +# dcb -N app show dev eth0 dscp-prio +.br +dscp-prio 0:0 24:2 24:3 48:6 + +Reconfigure the table so that the only rule for DSCP 24 is for assignment of +priority 4: + +.P +# dcb app replace dev eth0 dscp-prio 24:4 +.br +# dcb -N app show dev eth0 dscp-prio +.br +dscp-prio 0:0 24:4 48:6 + +Flush all DSCP rules: + +.P +# dcb app flush dev eth0 dscp-prio +.br +# dcb app show dev eth0 dscp-prio +.br +(nothing) + +Add a rule to map traffic with PCP 1 and DEI 0 to priority 1 and PCP 2 and DEI 1 +to priority 2: + +.P +# dcb app add dev eth0 pcp-prio 1nd:1 2de:2 +.br +# dcb app show dev eth0 pcp-prio +.br +pcp-prio 1nd:1 2de:2 + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Petr Machata <me@pmachata.org> diff --git a/man/man8/dcb-apptrust.8 b/man/man8/dcb-apptrust.8 new file mode 100644 index 0000000..ddc2133 --- /dev/null +++ b/man/man8/dcb-apptrust.8 @@ -0,0 +1,110 @@ +.TH DCB-APPTRUST 8 "22 November 2022" "iproute2" "Linux" +.SH NAME +dcb-apptrust \- show / configure per-selector trust and trust order of the +application priority table of the DCB (Data Center Bridging) subsystem. 
+.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B apptrust +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb apptrust show dev +.RI DEV +.RB "[ " order " ]" + +.ti -8 +.B dcb apptrust set dev +.RI DEV +.RB "[ " order " +.IR "SEL-LIST" " ]" + +.ti -8 +.IR SEL-LIST " := [ " SEL-LIST " ] " SEL + +.ti -8 +.IR SEL " := { " ethtype " | " stream-port " | " dgram-port " | " port " | " dscp " | " pcp " } " + +.SH DESCRIPTION + +.B dcb apptrust +is used to configure per-selector trust and trust order of the +Application Priority Table, see +.BR dcb-app (8) +for details on how to configure app table entries. + +Selector trust can be used by the +software stack, or drivers (most likely the latter), when querying the APP +table, to determine if an APP entry should take effect, or not. Additionally, the +order of the trusted selectors will dictate which selector should take +precedence, in the case of multiple different APP table selectors being present. + +.SH COMMANDS + +.TP +.B show +Display all trusted selectors. + +.TP +.B set +Set new list of trusted selectors. Empty list is effectively the same as +removing trust entirely. + +.SH PARAMETERS + +The following describes only the write direction, i.e. as used with the +\fBset\fR command. For the \fBshow\fR command, the parameter name is to be used +as a simple keyword without further arguments. This instructs the tool to show +the values of a given parameter. + +.TP +.B order \fISEL-LIST +\fISEL-LIST\fR is a space-separated list of selector names. 
Possible selector +values are: +.B ethtype, +.B stream-port, +.B dgram-port, +.B port, +.B dscp, +and +.B pcp + + +.SH EXAMPLE & USAGE + +Set trust order to: dscp, pcp for eth0: +.P +# dcb apptrust set dev eth0 order dscp pcp + +Set trust order to: port (stream or dgram), pcp, ethtype for eth1: +.P +# dcb apptrust set dev eth1 order port pcp ethtype + +Show what was set: + +.P +# dcb apptrust show dev eth1 +.br +order: port pcp ethtype + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8), +.BR dcb-app (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Daniel Machon <daniel.machon@microchip.com> diff --git a/man/man8/dcb-buffer.8 b/man/man8/dcb-buffer.8 new file mode 100644 index 0000000..c7ba6a9 --- /dev/null +++ b/man/man8/dcb-buffer.8 @@ -0,0 +1,126 @@ +.TH DCB-BUFFER 8 "12 November 2020" "iproute2" "Linux" +.SH NAME +dcb-buffer \- show / manipulate port buffer settings of +the DCB (Data Center Bridging) subsystem +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B buffer +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb buffer show dev +.RI DEV +.RB "[ " prio-buffer " ]" +.RB "[ " buffer-size " ]" +.RB "[ " total-size " ]" + +.ti -8 +.B dcb buffer set dev +.RI DEV +.RB "[ " prio-buffer " " \fIPRIO-MAP " ]" +.RB "[ " buffer-size " " \fISIZE-MAP " ]" + +.ti -8 +.IR PRIO-MAP " := [ " PRIO-MAP " ] " PRIO-MAPPING + +.ti -8 +.IR PRIO-MAPPING " := { " PRIO " | " \fBall " }" \fB:\fIBUFFER\fR + +.ti -8 +.IR SIZE-MAP " := [ " SIZE-MAP " ] " SIZE-MAPPING + +.ti -8 +.IR SIZE-MAPPING " := { " BUFFER " | " \fBall " }" \fB:\fISIZE\fR + +.ti -8 +.IR PRIO " := { " \fB0\fR " .. " \fB7\fR " }" + +.ti -8 +.IR BUFFER " := { " \fB0\fR " .. 
" \fB7\fR " }" + +.ti -8 +.IR SIZE " := { " INTEGER " | " INTEGER\fBK\fR " | " INTEGER\fBM\fR " | " ... " }" + +.SH DESCRIPTION + +.B dcb buffer +is used to configure assignment of traffic to port buffers based on traffic +priority, and sizes of those buffers. It can be also used to inspect the current +configuration, as well as total device memory that the port buffers take. + +.SH PARAMETERS + +For read-write parameters, the following describes only the write direction, +i.e. as used with the \fBset\fR command. For the \fBshow\fR command, the +parameter name is to be used as a simple keyword without further arguments. This +instructs the tool to show the value of a given parameter. When no parameters +are given, the tool shows the complete buffer configuration. + +.TP +.B total-size +A read-only property that shows the total device memory taken up by port +buffers. This might be more than a simple sum of individual buffer sizes if +there are any hidden or internal buffers. + +.TP +.B prio-buffer \fIPRIO-MAP +\fIPRIO-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are priorities, values are buffer indices. For each priority +sets a buffer where traffic with that priority is directed to. + +.TP +.B buffer-size \fISIZE-MAP +\fISIZE-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are buffer indices, values are sizes of that buffer in bytes. +The sizes can use the notation documented in section PARAMETERS at +.BR tc (8). +Note that the size requested by the tool can be rounded or capped by the driver +to satisfy the requirements of the device. 
+ +.SH EXAMPLE & USAGE + +Configure the priomap in a one-to-one fashion: + +.P +# dcb buffer set dev eth0 prio-buffer 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 + +Set sizes of all buffers to 10KB, except for buffer 6, which will have the size +1MB: + +.P +# dcb buffer set dev eth0 buffer-size all:10K 6:1M + +Show what was set: + +.P +# dcb buffer show dev eth0 +.br +prio-buffer 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 +.br +buffer-size 0:10Kb 1:10Kb 2:10Kb 3:10Kb 4:10Kb 5:10Kb 6:1Mb 7:10Kb +.br +total-size 1222Kb + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Petr Machata <me@pmachata.org> diff --git a/man/man8/dcb-dcbx.8 b/man/man8/dcb-dcbx.8 new file mode 100644 index 0000000..bafc18f --- /dev/null +++ b/man/man8/dcb-dcbx.8 @@ -0,0 +1,108 @@ +.TH DCB-DCBX 8 "13 December 2020" "iproute2" "Linux" +.SH NAME +dcb-dcbx \- show / manipulate port DCBX (Data Center Bridging eXchange) +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B dcbx +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb dcbx show dev +.RI DEV + +.ti -8 +.B dcb dcbx set dev +.RI DEV +.RB "[ " host " ]" +.RB "[ " lld-managed " ]" +.RB "[ " cee " ]" +.RB "[ " ieee " ]" +.RB "[ " static " ]" + +.SH DESCRIPTION + +Data Center Bridging eXchange (DCBX) is a protocol used by DCB devices to +exchange configuration information with directly connected peers. The Linux DCBX +object is a 1-byte bitfield of flags that configure whether DCBX is implemented +in the device or in the host, and which version of the protocol should be used. +.B dcb dcbx +is used to access the per-port Linux DCBX object. 
+ +There are two principal modes of operation: in +.B host +mode, DCBX protocol is implemented by the host LLDP agent, and the DCB +interfaces are used to propagate the negotiated parameters to capable devices. In +.B lld-managed +mode, the configuration is handled by the device, and DCB interfaces are used +for inspection of negotiated parameters, and can also be used to set initial +parameters. + +.SH PARAMETERS + +When used with +.B dcb dcbx set, +the following keywords enable the corresponding configuration. The keywords that +are not mentioned on the command line are considered disabled. When used with +.B show, +each enabled feature is shown by its corresponding keyword. + +.TP +.B host +.TQ +.B lld-managed +The device is in the host mode of operation and, respectively, the lld-managed +mode of operation, as described above. In principle these two keywords are +mutually exclusive, but +.B dcb dcbx +allows setting both and lets the driver handle it as appropriate. + +.TP +.B cee +.TQ +.B ieee +The device supports CEE (Converged Enhanced Ethernet) and, respectively, IEEE +version of the DCB specification. Typically only one of these will be set, but +.B dcb dcbx +does not mandate this. + +.TP +.B static +indicates the engine supports static configuration. No actual negotiation is +performed, negotiated parameters are always the initial configuration. + +.SH EXAMPLE & USAGE + +Put the DCB engine into the "host" mode of operation, and use IEEE-standardized +DCB interfaces: + +.P +# dcb dcbx set dev eth0 host ieee + +Show what was set: + +.P +# dcb dcbx show dev eth0 +.br +host ieee + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. 
+ +.SH AUTHOR +Petr Machata <me@pmachata.org> diff --git a/man/man8/dcb-ets.8 b/man/man8/dcb-ets.8 new file mode 100644 index 0000000..9c64b33 --- /dev/null +++ b/man/man8/dcb-ets.8 @@ -0,0 +1,194 @@ +.TH DCB-ETS 8 "19 October 2020" "iproute2" "Linux" +.SH NAME +dcb-ets \- show / manipulate ETS (Enhanced Transmission Selection) settings of +the DCB (Data Center Bridging) subsystem +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B ets +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb ets show dev +.RI DEV +.RB "[ " willing " ]" +.RB "[ " ets-cap " ]" +.RB "[ " cbs " ]" +.RB "[ " tc-tsa " ]" +.RB "[ " reco-tc-tsa " ]" +.RB "[ " pg-bw " ]" +.RB "[ " tc-bw " ]" +.RB "[ " reco-tc-bw " ]" +.RB "[ " prio-tc " ]" +.RB "[ " reco-prio-tc " ]" + +.ti -8 +.B dcb ets set dev +.RI DEV +.RB "[ " willing " { " on " | " off " } ]" +.RB "[ { " tc-tsa " | " reco-tc-tsa " } " \fITSA-MAP\fB " ]" +.RB "[ { " pg-bw " | " tc-bw " | " reco-tc-bw " } " \fIBW-MAP\fB " ]" +.RB "[ { " prio-tc " | " reco-prio-tc " } " \fIPRIO-MAP\fB " ]" + +.ti -8 +.IR TSA-MAP " := [ " TSA-MAP " ] " TSA-MAPPING + +.ti -8 +.IR TSA-MAPPING " := { " TC " | " \fBall " }" \fB: "{ " \fBstrict\fR " | " +.IR \fBcbs\fR " | " \fBets\fR " | " \fBvendor\fR " }" + +.ti -8 +.IR BW-MAP " := [ " BW-MAP " ] " BW-MAPPING + +.ti -8 +.IR BW-MAPPING " := { " TC " | " \fBall " }" \fB:\fIINTEGER\fR + +.ti -8 +.IR PRIO-MAP " := [ " PRIO-MAP " ] " PRIO-MAPPING + +.ti -8 +.IR PRIO-MAPPING " := { " PRIO " | " \fBall " }" \fB:\fITC\fR + +.ti -8 +.IR TC " := { " \fB0\fR " .. " \fB7\fR " }" + +.ti -8 +.IR PRIO " := { " \fB0\fR " .. " \fB7\fR " }" + +.SH DESCRIPTION + +.B dcb ets +is used to configure Enhanced Transmission Selection attributes through Linux +DCB (Data Center Bridging) interface. ETS permits configuration of mapping of +priorities to traffic classes, traffic selection algorithm to use per traffic +class, bandwidth allocation, etc. 
+ +Two DCB TLVs are related to the ETS feature: configuration and recommendation +values. Recommendation values are named with a prefix +.B reco-, +while the configuration ones have plain names. + +.SH PARAMETERS + +For read-write parameters, the following describes only the write direction, +i.e. as used with the \fBset\fR command. For the \fBshow\fR command, the +parameter name is to be used as a simple keyword without further arguments. This +instructs the tool to show the value of a given parameter. When no parameters +are given, the tool shows the complete ETS configuration. + +.TP +.B ets-cap +A read-only property that shows the number of supported ETS traffic classes. + +.TP +.B cbs +A read-only property that is enabled if the driver and the hardware support the +CBS Transmission Selection Algorithm. + +.TP +.B willing \fR{ \fBon\fR | \fBoff\fR } +Whether local host should accept configuration from peer TLVs. + +.TP +.B prio-tc \fIPRIO-MAP +.TQ +.B reco-prio-tc \fIPRIO-MAP +\fIPRIO-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are priorities, values are traffic classes. For each priority +sets a TC where traffic with that priority is directed to. + +.TP +.B tc-tsa \fITSA-MAP +.TQ +.B reco-tc-tsa \fITSA-MAP +\fITSA-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are TCs, values are Transmission Selection Algorithm (TSA) +keywords described below. For each TC sets an algorithm used for deciding how +traffic queued up at this TC is scheduled for transmission. Supported TSAs are: + +.B strict +- for strict priority, where traffic in higher-numbered TCs always takes +precedence over traffic in lower-numbered TCs. +.br +.B ets +- for Enhanced Transmission Selection, where available bandwidth is distributed among +the ETS-enabled TCs according to the weights set by +.B tc-bw +and +.B reco-tc-bw\fR, +respectively. 
+.br +.B cbs +- for Credit Based Shaper, where traffic is scheduled in a strict manner up to +the limit set by a shaper. +.br +.B vendor +- for vendor-specific traffic selection algorithm. + +.TP +.B tc-bw \fIBW-MAP +.TQ +.B reco-tc-bw \fIBW-MAP +\fIBW-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are TCs, values are integers representing percent of available +bandwidth given to the traffic class in question. The value should be 0 for TCs +whose TSA is not \fBets\fR, and the sum of all values shall be 100. As an +exception to the standard wording, a configuration with no \fBets\fR TCs is +permitted to sum up to 0 instead. +.br + +.TP +.B pg-bw \fIBW-MAP +The precise meaning of \fBpg-bw\fR is not standardized, but the assumption seems +to be that the same scheduling process as on the transmit side is applicable on +receive side as well, and configures receive bandwidth allocation for \fBets\fR +ingress traffic classes (priority groups). + +.SH EXAMPLE & USAGE + +Configure ETS priomap in a one-to-one fashion: + +.P +# dcb ets set dev eth0 prio-tc 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 + +Set TSA and transmit bandwidth configuration: + +.P +# dcb ets set dev eth0 tc-tsa all:strict 0:ets 1:ets 2:ets \\ +.br + tc-bw all:0 0:33 1:33 2:34 + +Show what was set: + +.P +# dcb ets show dev eth0 prio-tc tc-tsa tc-bw +.br +prio-tc 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 +.br +tc-tsa 0:ets 1:ets 2:ets 3:strict 4:strict 5:strict 6:strict 7:strict +.br +tc-bw 0:33 1:33 2:34 3:0 4:0 5:0 6:0 7:0 + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. 
+ +.SH AUTHOR +Petr Machata <me@pmachata.org> diff --git a/man/man8/dcb-maxrate.8 b/man/man8/dcb-maxrate.8 new file mode 100644 index 0000000..d03c215 --- /dev/null +++ b/man/man8/dcb-maxrate.8 @@ -0,0 +1,94 @@ +.TH DCB-MAXRATE 8 "22 November 2020" "iproute2" "Linux" +.SH NAME +dcb-maxrate \- show / manipulate port maxrate settings of +the DCB (Data Center Bridging) subsystem +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B maxrate +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb maxrate show dev +.RI DEV +.RB "[ " tc-maxrate " ]" + +.ti -8 +.B dcb maxrate set dev +.RI DEV +.RB "[ " tc-maxrate " " \fIRATE-MAP " ]" + +.ti -8 +.IR RATE-MAP " := [ " RATE-MAP " ] " RATE-MAPPING + +.ti -8 +.IR RATE-MAPPING " := { " TC " | " \fBall " }" \fB:\fIRATE\fR + +.ti -8 +.IR TC " := { " \fB0\fR " .. " \fB7\fR " }" + +.ti -8 +.IR RATE " := { " INTEGER "[" \fBbit\fR "] | " INTEGER\fBKbit\fR " | " +.IR INTEGER\fBMib\fR " | " ... " }" + +.SH DESCRIPTION + +.B dcb maxrate +is used to configure and inspect maximum rate at which traffic is allowed to +egress from a given traffic class. + +.SH PARAMETERS + +The following describes only the write direction, i.e. as used with the +\fBset\fR command. For the \fBshow\fR command, the parameter name is to be used +as a simple keyword without further arguments. This instructs the tool to show +the value of a given parameter. When no parameters are given, the tool shows the +complete maxrate configuration. + +.TP +.B tc-maxrate \fIRATE-MAP +\fIRATE-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are TC indices, values are traffic rates in bits per second. +The rates can use the notation documented in section PARAMETERS at +.BR tc (8). +Note that under that notation, "bit" stands for bits per second whereas "b" +stands for bytes per second. When showing, the command line option +.B -i +toggles between using decadic and ISO/IEC prefixes. 
+ +.SH EXAMPLE & USAGE + +Set rates of all traffic classes to 25Gbps, except for TC 6, which will +have the rate of 100Gbps: + +.P +# dcb maxrate set dev eth0 tc-maxrate all:25Gbit 6:100Gbit + +Show what was set: + +.P +# dcb maxrate show dev eth0 +.br +tc-maxrate 0:25Gbit 1:25Gbit 2:25Gbit 3:25Gbit 4:25Gbit 5:25Gbit 6:100Gbit 7:25Gbit + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Petr Machata <me@pmachata.org> diff --git a/man/man8/dcb-pfc.8 b/man/man8/dcb-pfc.8 new file mode 100644 index 0000000..735c16e --- /dev/null +++ b/man/man8/dcb-pfc.8 @@ -0,0 +1,127 @@ +.TH DCB-PFC 8 "31 October 2020" "iproute2" "Linux" +.SH NAME +dcb-pfc \- show / manipulate PFC (Priority-based Flow Control) settings of +the DCB (Data Center Bridging) subsystem +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B pfc +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb pfc show dev +.RI DEV +.RB "[ " pfc-cap " ]" +.RB "[ " prio-pfc " ]" +.RB "[ " macsec-bypass " ]" +.RB "[ " delay " ]" +.RB "[ " requests " ]" +.RB "[ " indications " ]" + +.ti -8 +.B dcb pfc set dev +.RI DEV +.RB "[ " prio-pfc " " \fIPFC-MAP " ]" +.RB "[ " macsec-bypass " { " on " | " off " } ]" +.RB "[ " delay " " \fIINTEGER\fR " ]" + +.ti -8 +.IR PFC-MAP " := [ " PFC-MAP " ] " PFC-MAPPING + +.ti -8 +.IR PFC-MAPPING " := { " PRIO " | " \fBall " }" \fB:\fR "{ " +.IR \fBon\fR " | " \fBoff\fR " }" + +.ti -8 +.IR PRIO " := { " \fB0\fR " .. " \fB7\fR " }" + +.SH DESCRIPTION + +.B dcb pfc +is used to configure Priority-based Flow Control attributes through Linux +DCB (Data Center Bridging) interface. 
PFC permits marking flows with a +certain priority as lossless, and holds related configuration, as well as +PFC counters. + +.SH PARAMETERS + +For read-write parameters, the following describes only the write direction, +i.e. as used with the \fBset\fR command. For the \fBshow\fR command, the +parameter name is to be used as a simple keyword without further arguments. This +instructs the tool to show the value of a given parameter. When no parameters +are given, the tool shows the complete PFC configuration. + +.TP +.B pfc-cap +A read-only property that shows the number of traffic classes that may +simultaneously support PFC. + +.TP +.B requests +A read-only count of the sent PFC frames per traffic class. Only shown when +-s is given, or when requested explicitly. + +.TP +.B indications +A read-only count of the received PFC frames per traffic class. Only shown +when -s is given, or when requested explicitly. + +.TP +.B macsec-bypass \fR{ \fBon\fR | \fBoff\fR } +Whether the sending station is capable of bypassing MACsec processing when +MACsec is disabled. + +.TP +.B prio-pfc \fIPFC-MAP +\fIPFC-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are priorities, values are on / off indicators of whether +PFC is enabled for a given priority. + +.TP +.B delay \fIINTEGER +The allowance made for round-trip propagation delay of the link in bits. +The value shall be 0..65535. + +.SH EXAMPLE & USAGE + +Enable PFC on priorities 6 and 7, leaving the rest intact: + +.P +# dcb pfc set dev eth0 prio-pfc 6:on 7:on + +Disable PFC of all priorities except 6 and 7, and configure delay to 4096 +bits: + +.P +# dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on delay 0x1000 + +Show what was set: + +.P +# dcb pfc show dev eth0 +.br +pfc-cap 8 macsec-bypass off delay 4096 +.br +prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. 
+ +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Petr Machata <me@pmachata.org> diff --git a/man/man8/dcb-rewr.8 b/man/man8/dcb-rewr.8 new file mode 100644 index 0000000..03b59cf --- /dev/null +++ b/man/man8/dcb-rewr.8 @@ -0,0 +1,206 @@ +.TH DCB-REWR 8 "15 may 2023" "iproute2" "Linux" +.SH NAME +dcb-rewr \- show / manipulate the rewrite table of +the DCB (Data Center Bridging) subsystem +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B rewr +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb rewr " { " show " | " flush " } " dev +.RI DEV +.RB "[ " prio-dscp " ]" +.RB "[ " prio-pcp " ]" + +.ti -8 +.B dcb rewr " { " add " | " del " | " replace " } " dev +.RI DEV +.RB "[ " prio-dscp " " \fIDSCP-MAP\fB " ]" +.RB "[ " prio-pcp " " \fIPCP-MAP\fB " ]" + +.ti -8 +.IR DSCP-MAP " := [ " DSCP-MAP " ] " DSCP-MAPPING + +.ti -8 +.IR DSCP-MAPPING " := " \fIPRIO \fB:\fR "{ " DSCP " | " \fBall\fR " }" + +.ti -8 +.IR PCP-MAP " := [ " PCP-MAP " ] " PCP-MAPPING + +.ti -8 +.IR PCP-MAPPING " := " \fIPRIO \fB:\fR PCP\fR + +.ti -8 +.IR DSCP " := { " \fB0\fR " .. " \fB63\fR " }" + +.ti -8 +.IR PCP " := { " \fB0(nd/de)\fR " .. " \fB7(nd/de)\fR " }" + +.ti -8 +.IR PRIO " := { " \fB0\fR " .. " \fB7\fR " }" + +.SH DESCRIPTION + +.B dcb rewr +is used to configure the rewrite table, in the DCB (Data Center Bridging) +subsystem. The rewrite table is used to rewrite certain values in the packet +headers, based on packet priority. + +DCB rewrite entries are, like DCB APP entries, 3-tuples of selector, protocol +ID, and priority. Selector is an enumeration that picks one of the +prioritization namespaces. Currently, only the DSCP and PCP selector namespaces +are supported by dcb rewr. 
+ +The rewrite table is a list of DCB rewrite rules that apply to packets +with matching priority. Notably, it is valid to have conflicting rewrite +assignments for the same selector and priority. For example, the set of two +rewrite entries (DSCP, 10, 1) and (DSCP, 11, 1), where packets with priority 1 +should have their DSCP value rewritten to both 10 and 11, form a well-defined +rewrite table. +.B dcb rewr +tool allows low-level management of the rewrite table by adding and deleting +individual rewrite 3-tuples through +.B add +and +.B del +commands. On the other hand, the command +.B replace +does what one would typically want in this situation--first adds the new +configuration, and then removes the obsolete one, so that only one +rewrite rule is in effect for a given selector and priority. + +.SH COMMANDS + +.TP +.B show +Display all entries with a given selector. When no selector is given, shows all +rewrite table entries categorized per selector. + +.TP +.B flush +Remove all entries with a given selector. When no selector is given, removes all +rewrite table entries. + +.TP +.B add +.TQ +.B del +Add and, respectively, remove individual rewrite 3-tuples to and from the DCB +rewrite table. + +.TP +.B replace +Take the list of entries mentioned as parameter, and add those that are not +present in the rewrite table yet. Then remove those entries, whose selector and +priority have been mentioned as parameter, but not with the exact same +protocol ID. This has the effect of, for the given selector and priority, +causing that the table only contains the protocol ID (or IDs) given as +parameter. + +.SH PARAMETERS + +The following table shows parameters in a way that they would be used with +\fBadd\fR, \fBdel\fR and \fBreplace\fR commands. For \fBshow\fR and +\fBflush\fR, the parameter name is to be used as a simple keyword without +further arguments. + +.TP +.B prio-dscp \fIDSCP-MAP +\fIDSCP-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. 
Keys are priorities, values are DSCP points for traffic +with matching priority. DSCP points can be written either directly as numeric +values, or using symbolic names specified in +.B /etc/iproute2/rt_dsfield +(however note that the file specifies full 8-bit dsfield values, whereas +.B dcb rewr +will only use the higher six bits). +.B dcb rewr show +will similarly format DSCP values as symbolic names if possible. The +command line option +.B -N +turns the show translation off. + +.TP +.B prio-pcp \fIPCP-MAP +\fIPCP-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are priorities. Values are PCP/DEI for traffic with +matching priority. PCP/DEI values are written as a combination of numeric and +symbolic values, to accommodate for both. PCP is always in numeric form e.g. 0 .. +7 and DEI in symbolic form e.g. 'de' (drop-eligible), indicating that the DEI +bit is 1 or 'nd' (not-drop-eligible), indicating that the DEI bit is 0. In +combination 1:2de translates to a mapping of priority 1 to PCP=2 and DEI=1. + +.SH EXAMPLE & USAGE + +Add a rule to rewrite DSCP to 0, 24 and 48 for traffic with priority 0, 3 and +6, respectively: +.P +# dcb rewr add dev eth0 prio-dscp 0:0 3:24 6:48 + +Add a rule to rewrite DSCP to 25 for traffic with priority 3: +.P +# dcb rewr add dev eth0 prio-dscp 3:25 +.br +# dcb rewr show dev eth0 prio-dscp +.br +prio-dscp 0:0 3:CS3 3:25 6:CS6 +.br +# dcb -N rewr show dev eth0 prio-dscp +.br +prio-dscp 0:0 3:24 3:25 6:48 + +Reconfigure the table so that only one rule exists for rewriting traffic with +priority 3. 
+ +.P +# dcb rewr replace dev eth0 prio-dscp 3:26 +.br +# dcb -N rewr show dev eth0 prio-dscp +.br +prio-dscp 0:0 3:26 6:48 + +Flush all DSCP rules: + +.P +# dcb rewr flush dev eth0 prio-dscp +.br +# dcb rewr show dev eth0 prio-dscp +.br +(nothing) + +Add a rule to rewrite PCP to 1 and DEI to 0 for traffic with priority 1 and a +rule to rewrite PCP to 2 and DEI to 1 for traffic with priority 2: + +.P +# dcb rewr add dev eth0 prio-pcp 1:1nd 2:2de +.br +# dcb rewr show dev eth0 prio-pcp +.br +prio-pcp 1:1nd 2:2de + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8), +.BR dcb-app (8), +.BR dcb-apptrust (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. You do not have to be +subscribed to the list to send a message there. + +.SH AUTHOR +Daniel Machon <daniel.machon@microchip.com> diff --git a/man/man8/dcb.8 b/man/man8/dcb.8 new file mode 100644 index 0000000..a1d6505 --- /dev/null +++ b/man/man8/dcb.8 @@ -0,0 +1,158 @@ +.TH DCB 8 "19 October 2020" "iproute2" "Linux" +.SH NAME +dcb \- show / manipulate DCB (Data Center Bridging) settings +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.RB "{ " app " | " buffer " | " ets " | " maxrate " | " pfc " }" +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb +.RB "[ " -force " ] " +.BI "-batch " filename +.sp + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B help +.sp + +.SH OPTIONS + +.TP +.BR "\-n" , " \--netns " <NETNS> +switches +.B dcb +to the specified network namespace +.IR NETNS . + +.TP +.BR "\-V" , " --Version" +Print the version of the +.B dcb +utility and exit. + +.TP +.BR "\-b", " --batch " <FILENAME> +Read commands from provided file or standard input and invoke them. First +failure will cause termination of dcb. + +.TP +.BR "\-f", " --force" +Don't terminate dcb on errors in batch mode. 
If there were any errors during +execution of the commands, the application return code will be non-zero. + +.TP +.BR "\-i" , " --iec" +When showing rates, use ISO/IEC 1024-based prefixes (Ki, Mi, Gi) instead of +the 1000-based ones (K, M, G). + +.TP +.BR "\-j" , " --json" +Generate JSON output. + +.TP +.BR "\-N" , " --Numeric" +If the subtool in question translates numbers to symbolic names in some way, +suppress this translation. + +.TP +.BR "\-p" , " --pretty" +When combined with -j generate a pretty JSON output. + +.TP +.BR "\-s" , " --statistics" +If the object in question contains any statistical counters, show them as +part of the "show" output. + +.SH OBJECTS + +.TP +.B app +- Configuration of application priority table + +.TP +.B buffer +- Configuration of port buffers + +.TP +.B ets +- Configuration of ETS (Enhanced Transmission Selection) + +.TP +.B maxrate +- Configuration of per-TC maximum transmit rate + +.TP +.B pfc +- Configuration of PFC (Priority-based Flow Control) + +.SH COMMANDS + +A \fICOMMAND\fR specifies the action to perform on the object. The set of +possible actions depends on the object type. As a rule, it is possible to +.B show +objects and to invoke topical +.B help, +which prints a list of available commands and argument syntax conventions. + +.SH ARRAY PARAMETERS + +Like commands, specification of parameters is in the domain of individual +objects (and their commands) as well. However, much of the DCB interface +revolves around arrays of fixed size that specify one value per some key, such +as per traffic class or per priority. There is therefore a single syntax for +adjusting elements of these arrays. It consists of a series of +\fIKEY\fB:\fIVALUE\fR pairs, where the meaning of the individual keys and values +depends on the parameter. + +The elements are evaluated in order from left to right, and the later ones +override the earlier ones. 
The elements that are not specified on the command +line are queried from the kernel and their current value is retained. + +As an example, take a made-up parameter tc-juju, which can be set to charm +traffic in a given TC with either good luck or bad luck. \fIKEY\fR can therefore +be 0..7 (as is usual for TC numbers in DCB), and \fIVALUE\fR either of +\fBnone\fR, \fBgood\fR, and \fBbad\fR. An example of changing a juju value of +TCs 0 and 7, while leaving all others intact, would then be: + +.P +# dcb foo set dev eth0 tc-juju 0:good 7:bad + +A special key, \fBall\fR, is recognized which sets the same value to all array +elements. This can be combined with the usual single-element syntax. E.g. in the +following, the juju of all keys is set to \fBnone\fR, except 0 and 7, which have +other values: + +.P +# dcb foo set dev eth0 tc-juju all:none 0:good 7:bad + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb-app (8), +.BR dcb-apptrust (8), +.BR dcb-buffer (8), +.BR dcb-ets (8), +.BR dcb-maxrate (8), +.BR dcb-pfc (8), +.BR dcb-rewr (8) +.br + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. 
+ +.SH AUTHOR +Petr Machata <me@pmachata.org> diff --git a/man/man8/devlink-dev.8 b/man/man8/devlink-dev.8 new file mode 100644 index 0000000..e9d091d --- /dev/null +++ b/man/man8/devlink-dev.8 @@ -0,0 +1,354 @@ +.TH DEVLINK\-DEV 8 "14 Mar 2016" "iproute2" "Linux" +.SH NAME +devlink-dev \- devlink device configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B dev +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-n\fR[\fIno-nice-names\fR] } + +.ti -8 +.B devlink dev show +.RI "[ " DEV " ]" + +.ti -8 +.B devlink dev help + +.ti -8 +.B devlink dev eswitch set +.I DEV +[ +.BR mode " { " legacy " | " switchdev " } " +] [ +.BR inline-mode " { " none " | " link " | " network " | " transport " } " +] [ +.BR encap-mode " { " none " | " basic " } " +] + +.ti -8 +.B devlink dev eswitch show +.I DEV + +.ti -8 +.B devlink dev param set +.I DEV +.B name +.I PARAMETER +.B value +.I VALUE +.BR cmode " { " runtime " | " driverinit " | " permanent " } " + +.ti -8 +.B devlink dev param show +[ +.I DEV +.B name +.I PARAMETER +] + +.ti -8 +.B devlink dev reload +.I DEV +[ +.B netns +.RI "{ " PID " | " NAME " | " ID " }" +] [ +.BR action " { " driver_reinit " | " fw_activate " }" +] [ +.B limit no_reset +] + +.ti -8 +.B devlink dev info +[ +.I DEV +] + +.ti -8 +.B devlink dev flash +.I DEV +.B file +.I PATH +[ +.B component +.I NAME +] + +.ti -8 +.B devlink dev selftests show +[ +.I DEV +] + +.ti -8 +.B devlink dev selftests run +.I DEV +[ +.B id +.I ID... +] + +.SH "DESCRIPTION" +.SS devlink dev show - display devlink device attributes + +.PP +.I "DEV" +- specifies the devlink device to show. +If this argument is omitted all devices are listed. 
+ +.in +4 +Format is: +.in +2 +BUS_NAME/BUS_ADDRESS + +.SS devlink dev eswitch show - display devlink device eswitch attributes +.SS devlink dev eswitch set - sets devlink device eswitch attributes + +.TP +.BR mode " { " legacy " | " switchdev " } " +Set eswitch mode + +.I legacy +- Legacy SRIOV + +.I switchdev +- SRIOV switchdev offloads + +.TP +.BR inline-mode " { " none " | " link " | " network " | " transport " } " +Some HWs need the VF driver to put part of the packet headers on the TX descriptor so the e-switch can do proper matching and steering. + +.I none +- None + +.I link +- L2 mode + +.I network +- L3 mode + +.I transport +- L4 mode + +.TP +.BR encap-mode " { " none " | " basic " } " +Set eswitch encapsulation support + +.I none +- Disable encapsulation support + +.I basic +- Enable encapsulation support + +.SS devlink dev param set - set new value to devlink device configuration parameter + +.TP +.BI name " PARAMETER" +Specify parameter name to set. + +.TP +.BI value " VALUE" +New value to set. + +.TP +.BR cmode " { " runtime " | " driverinit " | " permanent " } " +Configuration mode in which the new value is set. + +.I runtime +- Set new value while driver is running. This configuration mode doesn't require any reset to apply the new value. + +.I driverinit +- Set new value which will be applied during driver initialization. This configuration mode requires restart driver by devlink reload command to apply the new value. + +.I permanent +- New value is written to device's non-volatile memory. This configuration mode requires hard reset to apply the new value. + +.SS devlink dev param show - display devlink device supported configuration parameters attributes + +.B name +.I PARAMETER +Specify parameter name to show. +If this argument is omitted all parameters supported by devlink devices are listed. + +.SS devlink dev reload - perform hot reload of the driver. + +.PP +.I "DEV" +- Specifies the devlink device to reload. 
+ +.B netns +.RI { " PID " | " NAME " | " ID " } +- Specifies the network namespace to reload into, either by pid, name or id. + +.BR action " { " driver_reinit " | " fw_activate " }" +- Specifies the reload action required. +If this argument is omitted +.I driver_reinit +action will be used. +Note that even though user asks for a specific action, the driver implementation +might need to perform another action alongside it. For example, some +drivers do not support driver reinitialization being performed without fw +activation. Therefore, the devlink reload command returns the list of actions +which were actually performed. + +.I driver_reinit +- Driver entities re-initialization, applying devlink-param and +devlink-resource values. + +.I fw_activate +- Activates new firmware if such image is stored and pending activation. If no +limitation specified this action may involve firmware reset. If no new image +pending this action will reload current firmware image. + +.B limit no_reset +- Specifies limitation on reload action. +If this argument is omitted limit is unspecified and the reload action is not +limited. In such case driver implementation may include reset or downtime as +needed to perform the actions. + +.I no_reset +- No reset allowed, no down time allowed, no link flap and no configuration is +lost. + +.SS devlink dev info - display device information. +Display device information provided by the driver. This command can be used +to query versions of the hardware components or device components which +can't be updated ( +.I fixed +) as well as device firmware which can be updated. For firmware components +.I running +displays the versions of firmware currently loaded into the device, while +.I stored +reports the versions in device's flash. +.I Running +and +.I stored +versions may differ after flash has been updated, but before reboot. + +.PP +.I "DEV" +- specifies the devlink device to show. +If this argument is omitted all devices are listed. 
+ +.SS devlink dev flash - write device's non-volatile memory. + +.PP +.I "DEV" +- specifies the devlink device to write to. + +.B file +.I PATH +- Path to the file which will be written into device's flash. The path needs +to be relative to one of the directories searched by the kernel firmware loader, +such as /lib/firmware. + +.B component +.I NAME +- If device stores multiple firmware images in non-volatile memory, this +parameter may be used to indicate which firmware image should be written. +The value of +.I NAME +should match the component names from +.B "devlink dev info" +and may be driver-dependent. + +.SS devlink dev selftests show - shows supported selftests on devlink device. + +.PP +.I "DEV" +- specifies the devlink device. +If this argument is omitted all selftests for devlink devices are listed. + +.SS devlink dev selftests run - runs selftests on devlink device. + +.PP +.I "DEV" +- specifies the devlink device to execute selftests. + +.B id +.I ID... +- The value of +.I ID(s) +should match the selftests shown in +.B "devlink dev selftests show" +to execute selftests on the devlink device. +If this argument is omitted all selftests supported by devlink devices are executed. + +.SH "EXAMPLES" +.PP +devlink dev show +.RS 4 +Shows the state of all devlink devices on the system. +.RE +.PP +devlink dev show pci/0000:01:00.0 +.RS 4 +Shows the state of specified devlink device. +.RE +.PP +devlink dev eswitch show pci/0000:01:00.0 +.RS 4 +Shows the eswitch mode of specified devlink device. +.RE +.PP +devlink dev eswitch set pci/0000:01:00.0 mode switchdev +.RS 4 +Sets the eswitch mode of specified devlink device to switchdev. +.RE +.PP +devlink dev param show pci/0000:01:00.0 name max_macs +.RS 4 +Shows the parameter max_macs attributes. +.RE +.PP +devlink dev param set pci/0000:01:00.0 name internal_error_reset value true cmode runtime +.RS 4 +Sets the parameter internal_error_reset of specified devlink device to true. 
+.RE +.PP +devlink dev reload pci/0000:01:00.0 +.RS 4 +Performs hot reload of specified devlink device. +.RE +.PP +devlink dev flash pci/0000:01:00.0 file firmware.bin +.RS 4 +Flashes the specified devlink device with provided firmware file name. If the driver supports it, user gets updates about the flash status. For example: +.br +Preparing to flash +.br +Flashing 100% +.br +Flashing done +.RE +.PP +devlink dev selftests show pci/0000:01:00.0 +.RS 4 +Shows the supported selftests by the devlink device. +.RE +.PP +devlink dev selftests run pci/0000:01:00.0 id flash +.RS 4 +Perform a flash test on the devlink device. +.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-port (8), +.BR devlink-sb (8), +.BR devlink-monitor (8), +.br + +.SH AUTHOR +Jiri Pirko <jiri@mellanox.com> diff --git a/man/man8/devlink-dpipe.8 b/man/man8/devlink-dpipe.8 new file mode 100644 index 0000000..3a4d254 --- /dev/null +++ b/man/man8/devlink-dpipe.8 @@ -0,0 +1,99 @@ +.TH DEVLINK\-DPIPE 8 "4 Apr 2020" "iproute2" "Linux" +.SH NAME +devlink-dpipe \- devlink dataplane pipeline visualization +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B dpipe +.RB "{ " table " | " header " }" +.RI "{ " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] } + +.ti -8 +.BI "devlink dpipe table show " DEV +.RB "[ " name +.IR TABLE_NAME " ]" + +.ti -8 +.BI "devlink dpipe table set " DEV +.BI name " TABLE_NAME " + +.ti -8 +.BI "devlink dpipe table dump " DEV +.BI name " TABLE_NAME " + +.ti -8 +.BI "devlink dpipe header show " DEV + +.ti -8 +.B devlink dpipe help + +.SH "DESCRIPTION" +.SS devlink dpipe table show - display devlink dpipe table attributes + +.TP +.BI name " TABLE_NAME" +Specifies the table to operate on. + +.SS devlink dpipe table set - set devlink dpipe table attributes + +.TP +.BI name " TABLE_NAME" +Specifies the table to operate on. 
+ +.SS devlink dpipe table dump - dump devlink dpipe table entries + +.TP +.BI name " TABLE_NAME" +Specifies the table to operate on. + +.SS devlink dpipe header show - display devlink dpipe header attributes + +.TP +.BI name " TABLE_NAME" +Specifies the table to operate on. + +.SH "EXAMPLES" +.PP +devlink dpipe table show pci/0000:01:00.0 +.RS 4 +Shows all dpipe tables on specified devlink device. +.RE +.PP +devlink dpipe table show pci/0000:01:00.0 name mlxsw_erif +.RS 4 +Shows mlxsw_erif dpipe table on specified devlink device. +.RE +.PP +devlink dpipe table set pci/0000:01:00.0 name mlxsw_erif counters_enabled true +.RS 4 +Turns on the counters on mlxsw_erif table. +.RE +.PP +devlink dpipe table dump pci/0000:01:00.0 name mlxsw_erif +.RS 4 +Dumps content of mlxsw_erif table. +.RE +.PP +devlink dpipe header show pci/0000:01:00.0 +.RS 4 +Shows all dpipe headers on specified devlink device. +.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-monitor (8), +.br + +.SH AUTHOR +Jiri Pirko <jiri@mellanox.com> diff --git a/man/man8/devlink-health.8 b/man/man8/devlink-health.8 new file mode 100644 index 0000000..975b8c7 --- /dev/null +++ b/man/man8/devlink-health.8 @@ -0,0 +1,256 @@ +.TH DEVLINK\-HEALTH 8 "20 Feb 2019" "iproute2" "Linux" +.SH NAME +devlink-health \- devlink health reporting and recovery +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B health +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] } + +.ti -8 +.B devlink health show +.RI "[ { " DEV " | " DEV/PORT_INDEX " }" +.B reporter +.RI ""REPORTER " ] " + +.ti -8 +.B devlink health recover +.RI "{ " DEV " | " DEV/PORT_INDEX " }" +.B reporter +.RI "" REPORTER "" + +.ti -8 +.B devlink health diagnose +.RI "{ " DEV " | " DEV/PORT_INDEX " }" +.B reporter +.RI "" REPORTER "" + +.ti -8 +.B devlink health dump show +.RI "{ " DEV " | " DEV/PORT_INDEX " }" +.B reporter +.RI "" REPORTER "" + +.ti -8 +.BR 
"devlink health test" +.RI "{ " DEV " | " DEV/PORT_INDEX " }" +.B reporter +.RI "" REPORTER "" + +.ti -8 +.B devlink health dump clear +.RI "{ " DEV " | " DEV/PORT_INDEX " }" +.B reporter +.RI "" REPORTER "" + +.ti -8 +.B devlink health set +.RI "{ " DEV " | " DEV/PORT_INDEX " }" +.B reporter +.RI "" REPORTER "" +[ +.BI "grace_period " MSEC " +] [ +.BR auto_recover " { " true " | " false " } " +] [ +.BR auto_dump " { " true " | " false " } " +] + +.ti -8 +.B devlink health help + +.SH "DESCRIPTION" +.SS devlink health show - Show status and configuration on all supported reporters. +Displays info about reporters registered on devlink devices and ports. + +.PP +.I "DEV" +- specifies the devlink device. +.br +.I DEV/PORT_INDEX +- specifies the devlink port. + +.PP +.I "REPORTER" +- specifies the reporter's name registered on specified devlink device or port. + +.SS devlink health recover - Initiate a recovery operation on a reporter. +This action performs a recovery and increases the recoveries counter on success. + +.PP +.I "DEV" +- specifies the devlink device. +.br +.I DEV/PORT_INDEX +- specifies the devlink port. + +.PP +.I "REPORTER" +- specifies the reporter's name registered on specified devlink device or port. + +.SS devlink health diagnose - Retrieve diagnostics data on a reporter. + +.PP +.I DEV +- specifies the devlink device. +.br +.I DEV/PORT_INDEX +- specifies the devlink port. + +.PP +.I "REPORTER" +- specifies the reporter's name registered on specified devlink device or port. + +.SS devlink health test - Trigger a test event on a reporter. + +.PP +.I "DEV" +- specifies the devlink device. + +.PP +.I "REPORTER" +- specifies the reporter's name registered on the devlink device. + +.SS devlink health dump show - Display the last saved dump. + +.PD 0 +.P +devlink health saves a single dump per reporter. If an dump is +.P +not already stored by the Devlink, this command will generate a new +.P +dump. 
The dump can be generated either automatically when a +.P +reporter reports on an error or manually at the user's request. +.PD + +.PP +.I "DEV" +- specifies the devlink device. +.br +.I DEV/PORT_INDEX +- specifies the devlink port. + +.PP +.I "REPORTER" +- specifies the reporter's name registered on specified devlink device or port. + +.SS devlink health dump clear - Delete the saved dump. +Deleting the saved dump enables a generation of a new dump on +.PD 0 +.P +the next "devlink health dump show" command. +.PD + +.PP +.I "DEV" +- specifies the devlink device. +.br +.I DEV/PORT_INDEX +- specifies the devlink port. + +.PP +.I "REPORTER" +- specifies the reporter's name registered on specified devlink device or port. + +.SS devlink health set - Configure health reporter. +Please note that some params are not supported on a reporter which +doesn't support a recovery or dump method. + +.PP +.I "DEV" +- specifies the devlink device. +.br +.I DEV/PORT_INDEX +- specifies the devlink port. + +.PP +.I "REPORTER" +- specifies the reporter's name registered on specified devlink device or port. + +.TP +.BI grace_period " MSEC " +Time interval between consecutive auto recoveries. + +.TP +.BR auto_recover " { " true " | " false " } " +Indicates whether the devlink should execute automatic recover on error. + +.TP +.BR auto_dump " { " true " | " false " } " +Indicates whether the devlink should execute automatic dump on error. + +.SH "EXAMPLES" +.PP +devlink health show +.RS 4 +List status and configuration of available reporters on devices and ports. +.RE +.PP +devlink health show pci/0000:00:09.0/1 reporter tx +.RS 4 +List status and configuration of tx reporter registered on port on pci/0000:00:09.0/1 +.RE +.PP +devlink health recover pci/0000:00:09.0 reporter fw_fatal +.RS 4 +Initiate recovery on fw_fatal reporter registered on device on pci/0000:00:09.0. 
+.RE +.PP +devlink health recover pci/0000:00:09.0/1 reporter tx +.RS 4 +Initiate recovery on tx reporter registered on port on pci/0000:00:09.0/1. +.RE +.PP +devlink health diagnose pci/0000:00:09.0 reporter fw +.RS 4 +List diagnostics data on the specified device and reporter. +.RE +.PP +devlink health dump show pci/0000:00:09.0/1 reporter tx +.RS 4 +Display the last saved dump on the specified port and reporter. +.RE +.PP +devlink health dump clear pci/0000:00:09.0/1 reporter tx +.RS 4 +Delete saved dump on the specified port and reporter. +.RE +.PP +devlink health set pci/0000:00:09.0 reporter fw_fatal grace_period 3500 +.RS 4 +Set time interval between auto recoveries to minimum of 3500 msec on +the specified device and reporter. +.RE +.PP +devlink health set pci/0000:00:09.0/1 reporter tx grace_period 3500 +.RS 4 +Set time interval between auto recoveries to minimum of 3500 msec on +the specified port and reporter. +.RE +.PP +devlink health set pci/0000:00:09.0 reporter fw_fatal auto_recover false +.RS 4 +Turn off auto recovery on the specified device and reporter. 
+ +.RE +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-port (8), +.BR devlink-param (8), +.BR devlink-region (8), +.br + +.SH AUTHOR +Aya Levin <ayal@mellanox.com> diff --git a/man/man8/devlink-lc.8 b/man/man8/devlink-lc.8 new file mode 100644 index 0000000..b588cbc --- /dev/null +++ b/man/man8/devlink-lc.8 @@ -0,0 +1,101 @@ +.TH DEVLINK\-LC 8 "20 Apr 2022" "iproute2" "Linux" +.SH NAME +devlink-lc \- devlink line card configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B lc +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] } + +.ti -8 +.B "devlink lc set" +.IB DEV " lc " LC_INDEX +.RB [ " type " { +.IR LC_TYPE " | " +.BR notype " } ] " + +.ti -8 +.B "devlink lc show" +.RI "[ " DEV " [ " +.BI lc " LC_INDEX +] ] + +.ti -8 +.B devlink lc help + +.SH "DESCRIPTION" +.SS devlink lc set - change line card attributes + +.PP +.TP +.I "DEV" +Specifies the devlink device to operate on. + +.in +4 +Format is: +.in +2 +BUS_NAME/BUS_ADDRESS + +.TP +.BI lc " LC_INDEX " +Specifies index of a line card slot to set. + +.TP +.BR type " { " +.IR LC_TYPE " | " +.BR notype " } " +Type of line card to provision. Each driver provides a list of supported line card types which is shown in the output of +.BR "devlink lc show " command. + +.SS devlink lc show - display line card attributes + +.PP +.TP +.I "DEV" +.RB "Specifies the devlink device to operate on. If this and " lc " arguments are omitted all line cards of all devices are listed. + +.TP +.BI lc " LC_INDEX " +Specifies index of a line card slot to show. + +.SH "EXAMPLES" +.PP +devlink lc show +.RS 4 +Shows the state of all line cards on the system. +.RE +.PP +devlink lc show pci/0000:01:00.0 lc 1 +.RS 4 +Shows the state of line card with index 1. +.RE +.PP +devlink lc set pci/0000:01:00.0 lc 1 type 16x100G +.RS 4 +.RI "Sets type of specified line card to type " 16x100G "." 
+.RE +.PP +devlink lc set pci/0000:01:00.0 lc 1 notype +.RS 4 +Clears provisioning on a line card. +.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-port (8), +.BR devlink-monitor (8), +.br + +.SH AUTHOR +Jiri Pirko <jiri@nvidia.com> diff --git a/man/man8/devlink-monitor.8 b/man/man8/devlink-monitor.8 new file mode 100644 index 0000000..de351f3 --- /dev/null +++ b/man/man8/devlink-monitor.8 @@ -0,0 +1,39 @@ +.TH DEVLINK\-MONITOR 8 "14 Mar 2016" "iproute2" "Linux" +.SH "NAME" +devlink-monitor \- state monitoring +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.BR "devlink monitor" " [ " all " |" +.IR OBJECT-LIST " ]" +.sp + +.SH DESCRIPTION +The +.B devlink +utility can monitor the state of devlink devices and ports +continuously. This option has a slightly different format. Namely, the +.B monitor +command is the first in the command line and then the object list. + +.I OBJECT-LIST +is the list of object types that we want to monitor. +It may contain +.BR dev ", " port ", " health ", " trap ", " trap-group ", " trap-policer . + +.B devlink +opens Devlink Netlink socket, listens on it and dumps state changes. 
+ +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-sb (8), +.BR devlink-port (8), +.BR devlink-health (8), +.BR devlink-trap (8), +.br + +.SH AUTHOR +Jiri Pirko <jiri@mellanox.com> diff --git a/man/man8/devlink-port.8 b/man/man8/devlink-port.8 new file mode 100644 index 0000000..70d8837 --- /dev/null +++ b/man/man8/devlink-port.8 @@ -0,0 +1,417 @@ +.TH DEVLINK\-PORT 8 "14 Mar 2016" "iproute2" "Linux" +.SH NAME +devlink-port \- devlink port configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B port +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-n\fR[\fIno-nice-names\fR] } + +.ti -8 +.BR "devlink port set " +.IR DEV/PORT_INDEX +.RI "[ " +.BR type " { " eth " | " ib " | " auto " }" +.RI "]" + +.ti -8 +.BR "devlink port split " +.IR DEV/PORT_INDEX +.BR count +.IR COUNT + +.ti -8 +.BR "devlink port unsplit " +.IR DEV/PORT_INDEX + +.ti -8 +.B devlink port show +.RI "[ " DEV/PORT_INDEX " ]" + +.ti -8 +.B devlink port health +.RI "{ " show " | " recover " | " diagnose " | " dump " | " set " }" + +.ti -8 +.BI "devlink port add" +.RB "{" +.IR "DEV | DEV/PORT_INDEX" +.RB "} " +.RB "[ " flavour +.IR FLAVOUR " ]" +.RB "[ " pfnum +.IR PFNUMBER " ]" +.RB "[ " sfnum +.IR SFNUMBER " ]" +.RB "[ " controller +.IR CNUM " ]" +.br + +.ti -8 +.B devlink port del +.IR DEV/PORT_INDEX + +.ti -8 +.BR "devlink port function set " +.IR DEV/PORT_INDEX +.RI "[ " +.BR "hw_addr " +.RI "ADDR ]" +.RI "[ " +.BR state " { " active " | " inactive " }" +.RI "]" +.RI "[ " +.BR roce " { " enable " | " disable " }" +.RI "]" +.RI "[ " +.BR migratable " { " enable " | " disable " }" +.RI "]" +.RI "[ " +.BR ipsec_crypto " { " enable " | " disable " }" +.RI "]" +.RI "[ " +.BR ipsec_packet " { " enable " | " disable " }" +.RI "]" + +.ti -8 +.BR "devlink port function rate " +.RI "{ " show " | " set " | " add " | " del " | " help " }" + +.ti -8 +.B devlink dev param set +.I DEV/PORT_INDEX +.B 
name +.I PARAMETER +.B value +.I VALUE +.BR cmode " { " runtime " | " driverinit " | " permanent " } " + +.ti -8 +.B devlink dev param show +[ +.I DEV/PORT_INDEX +.B name +.I PARAMETER +] + +.ti -8 +.B devlink port help + +.SH "DESCRIPTION" +.SS devlink port set - change devlink port attributes + +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.in +4 +Format is: +.in +2 +BUS_NAME/BUS_ADDRESS/PORT_INDEX + +.TP +.BR type " { " eth " | " ib " | " auto " } " +set port type + +.I eth +- Ethernet + +.I ib +- Infiniband + +.I auto +- autoselect + +.SS devlink port split - split devlink port into more + +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.TP +.BI count " COUNT" +number of ports to split to. + +.SS devlink port unsplit - unsplit previously split devlink port +Could be performed on any split port of the same split group. + +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.SS devlink port show - display devlink port attributes + +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to show. +If this argument is omitted all ports are listed. + +.SS devlink port health - devlink health reporting and recovery +Is an alias for +.BR devlink-health (8). + +.ti -8 +.SS devlink port add - add a devlink port +.PP +.I "DEV" +- specifies the devlink device to operate on. or + +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port index to use for the requested new port. +This is optional. When omitted, driver allocates unique port index. + +.TP +.BR flavour " { " pcipf " | " pcisf " } " +set port flavour + +.I pcipf +- PCI PF port + +.I pcisf +- PCI SF port + +.TP +.BI pfnum " PFNUMBER " +Specifies PCI pfnumber to use on which a SF device to create + +.TP +.BI sfnum " SFNUMBER " +Specifies sfnumber to assign to the device of the SF. +This field is optional for those devices which supports auto assignment of the +SF number. 
+ +.TP +.BI controller " CNUM " +Specifies controller number for which the SF port is created. +This field is optional. It is used only when SF port is created for the +external controller. + +.ti -8 +.SS devlink port function set - Set the port function attribute(s). + +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.TP +.BI hw_addr " ADDR" +Hardware address of the function to set. This is a Ethernet MAC address when +port type is Ethernet. + +.TP +.BR state " { " active " | " inactive " } " +New state of the function to change to. + +.I active +- Once configuration of the function is done, activate the function. + +.I inactive +- To inactivate the function and its device(s), set to inactive. + +.TP +.BR roce " { " enable " | " disable " } " +Set the RoCE capability of the function. + +.TP +.BR migratable " { " enable " | " disable " } " +Set the migratable capability of the function. + +.TP +.BR ipsec_crypto " { " enable " | " disable " } " +Set the IPsec crypto offload capability of the function. Controls XFRM state +crypto operation (Encrypt/Decrypt) offload. + +.TP +.BR ipsec_packet " { " enable " | " disable " } " +Set the IPsec packet offload capability of the function. Controls XFRM state +and policy offload (Encrypt/Decrypt operation and IPsec encapsulation). + +.ti -8 +.SS devlink port del - delete a devlink port +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to delete. + +.ti -8 +.SS devlink port param set - set new value to devlink port configuration parameter +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.TP +.BI name " PARAMETER" +Specify parameter name to set. + +.TP +.BI value " VALUE" +New value to set. + +.TP +.BR cmode " { " runtime " | " driverinit " | " permanent " } " +Configuration mode in which the new value is set. + +.I runtime +- Set new value while driver is running. This configuration mode doesn't require any reset to apply the new value. 
+ +.I driverinit +- Set new value which will be applied during driver initialization. This configuration mode requires restarting the driver with the devlink reload command to apply the new value. + +.I permanent +- New value is written to device's non-volatile memory. This configuration mode requires hard reset to apply the new value. + +.SS devlink port param show - display devlink port supported configuration parameters attributes + +.PP +.I "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.B name +.I PARAMETER +Specify parameter name to show. +If this argument, as well as port index, are omitted - all parameters supported by devlink device ports are listed. + +.SS devlink port function rate - manage devlink rate objects +Is an alias for +.BR devlink-rate (8). + +.SH "EXAMPLES" +.PP +devlink port show +.RS 4 +Shows the state of all devlink ports on the system. +.RE +.PP +devlink port show pci/0000:01:00.0/1 +.RS 4 +Shows the state of specified devlink port. +.RE +.PP +devlink port set pci/0000:01:00.0/1 type eth +.RS 4 +Set type of specified devlink port to Ethernet. +.RE +.PP +devlink port split pci/0000:01:00.0/1 count 4 +.RS 4 +Split the specified devlink port into four ports. +.RE +.PP +devlink port unsplit pci/0000:01:00.0/1 +.RS 4 +Unsplit the specified previously split devlink port. +.RE +.PP +devlink port health show +.RS 4 +Shows status and configuration of all supported reporters registered on all devlink ports. +.RE +.PP +devlink port health show pci/0000:01:00.0/1 reporter tx +.RS 4 +Shows status and configuration of tx reporter registered on pci/0000:01:00.0/1 devlink port. +.RE +.PP +devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 +.RS 4 +Add a devlink port of flavour PCI SF on PCI PF having number 0 with SF number 88. +To make use of the function an example sequence is to add a port, configure the +function attribute and activate the function. 
Once function usage is completed, +deactivate the function and finally delete the port. When there is a desire to +reuse the port without deletion, it can be reconfigured and activated again when +function is in inactive state and function's operational state is detached. +.RE +.PP +devlink port del pci/0000:06:00.0/1 +.RS 4 +Delete previously created devlink port. It is recommended to first deactivate +the function if the function supports state management. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 hw_addr 00:00:00:11:22:33 +.RS 4 +Configure hardware address of the PCI function represented by devlink port. +If the port supports change in function state, hardware address must be configured +before activating the function. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 state active +.RS 4 +Activate the function. This will initiate the function enumeration and driver loading. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 state inactive +.RS 4 +Deactivate the function. This will initiate the function teardown which results +in driver unload and device removal. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 roce enable +.RS 4 +This will enable the RoCE functionality of the function. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 migratable enable +.RS 4 +This will enable the migratable functionality of the function. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 ipsec_crypto enable +.RS 4 +This will enable the IPsec crypto offload functionality of the function. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 ipsec_packet enable +.RS 4 +This will enable the IPsec packet offload functionality of the function. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 hw_addr 00:00:00:11:22:33 state active +.RS 4 +Configure hardware address and also activate the function. 
When a function is +activated together with other configuration in a single command, all the +configuration is applied first before changing the state to active. +.RE +.PP +devlink dev param show +.RS 4 +Shows (dumps) all the port parameters across all the devices registered in the devlink. +.RE +.PP +devlink dev param set pci/0000:01:00.0/1 name internal_error_reset value true cmode runtime +.RS 4 +Sets the parameter internal_error_reset of specified devlink port (#1) to true. +.RE +.PP +devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 controller 1 +.RS 4 +Add a devlink port of flavour PCI SF on controller 1 which has PCI PF of number +0 with SF number 88. To make use of the function an example sequence is to add +a port, configure the function attribute and activate the function. Once +the function usage is completed, deactivate the function and finally delete +the port. When there is desire to reuse the port without deletion, it can be +reconfigured and activated again when function is in inactive state and +function's operational state is detached. 
+.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-sb (8), +.BR devlink-monitor (8), +.BR devlink-health (8), +.br + +.SH AUTHOR +Jiri Pirko <jiri@mellanox.com> diff --git a/man/man8/devlink-rate.8 b/man/man8/devlink-rate.8 new file mode 100644 index 0000000..f09ac4a --- /dev/null +++ b/man/man8/devlink-rate.8 @@ -0,0 +1,292 @@ +.TH DEVLINK\-RATE 8 "12 Mar 2021" "iproute2" "Linux" +.SH NAME +devlink-rate \- devlink rate management +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B port function rate +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +.BR -j [ \fIson "] | " -p [ \fIretty "] | " -i [ \fIec "] }" + +.ti -8 +.B devlink port function rate show +.RI "[ { " DEV/PORT_INDEX " | " DEV/NODE_NAME " } ]" + +.ti -8 +.B devlink port function rate set +.RI "{ " DEV/PORT_INDEX " | " DEV/NODE_NAME " } " +.RB [ " tx_share \fIVALUE " ] +.RB [ " tx_max \fIVALUE " ] +.RB [ " tx_priority \fIN " ] +.RB [ " tx_weight \fIN " ] +.RB "[ {" " parent \fINODE_NAME " | " noparent " "} ]" + +.ti -8 +.BI "devlink port function rate add " DEV/NODE_NAME +.RB [ " tx_share \fIVALUE " ] +.RB [ " tx_max \fIVALUE " ] +.RB [ " tx_priority \fIN " ] +.RB [ " tx_weight \fIN " ] +.RB "[ {" " parent \fINODE_NAME " | " noparent " "} ]" + +.ti -8 +.BI "devlink port function rate del " DEV/NODE_NAME + +.ti -8 +.B devlink port function rate help + +.SH "DESCRIPTION" + +.SS devlink port function rate show - display rate objects. +Displays specified rate object or, if not specified, all rate objects. Rate +object can be presented by one of the two types: +.TP 8 +.B leaf +Represents a single devlink port; created/destroyed by the driver and bound to +the devlink port. As example, some driver may create leaf rate object for every +devlink port associated with VF. 
Since a leaf has a 1-to-1 mapping to its devlink +port, in user space it is referred to as the corresponding devlink port +\fIDEV/PORT_INDEX\fR; +.TP 8 +.B node +Represents a group of rate objects; created/deleted by the user (see command +below) and bound to the devlink device rather than to the devlink port. In +userspace it is referred to as \fIDEV/NODE_NAME\fR, where node name can be any, +except decimal number, to avoid collisions with leafs. +.PP +Command output shows the rate object identifier, its type and rate values along with +parent node name. Rate values are printed in SI units which are more suitable to +represent specific value. To print values in IEC units \fB-i\fR switch is +used. JSON (\fB-j\fR) output always prints rate values in bytes per second. Zero +rate values mean "unlimited" rates and are omitted in output, as well as parent +node name. + +.SS devlink port function rate set - set rate object parameters. +Allows setting rate object's parameters. If any parameter is specified multiple times +the last occurrence is used. +.PP +.I DEV/PORT_INDEX +- specifies devlink leaf rate object. +.br +.I DEV/NODE_NAME +- specifies devlink node rate object. +.PP +.BI tx_share " VALUE" +- specifies minimal tx rate value shared among all rate objects. If rate object +is a part of some rate group, then this value is shared with rate objects of this +rate group. +.PP +.BI tx_max " VALUE" +- specifies maximum tx rate value. +.PP +.BI tx_priority " N" +- allows for usage of strict priority arbiter among siblings. This arbitration +scheme attempts to schedule nodes based on their priority as long as the nodes +remain within their bandwidth limit. The higher the priority the higher the +probability that the node will get selected for scheduling. +.PP +.BI tx_weight " N" +- allows for usage of Weighted Fair Queuing arbitration scheme among siblings. +This arbitration scheme can be used simultaneously with the strict priority. 
+As a node is configured with a higher rate it gets more BW relative to its +siblings. Values are relative like percentage points, they basically tell +how much BW a node should take relative to its siblings. +.PP +.TP 8 +.I VALUE +These parameters accept a floating point number, possibly followed by a +unit (both SI and IEC units supported). +.RS +.TP +bit or a bare number +Bits per second +.TP +kbit +Kilobits per second +.TP +mbit +Megabits per second +.TP +gbit +Gigabits per second +.TP +tbit +Terabits per second +.TP +bps +Bytes per second +.TP +kbps +Kilobytes per second +.TP +mbps +Megabytes per second +.TP +gbps +Gigabytes per second +.TP +tbps +Terabytes per second +.P +To specify in IEC units, replace the SI prefix (k-, m-, g-, t-) with IEC prefix +(ki-, mi-, gi- and ti-) respectively. Input is case-insensitive. +.RE +.PP +.TP 8 +.I N +These parameters accept an integer meaning weight or priority of a node. +.PP +.BI parent " NODE_NAME \fR| " noparent +- set rate object parent to existing node with name \fINODE_NAME\fR or unset +parent. Rate limits of the parent node are applied to all its children. Actual +behaviour is a detail of the driver's implementation. Setting parent to an empty ("") +name is, due to the kernel logic, treated as parent unset. + +.SS devlink port function rate add - create node rate object with specified parameters. +Creates rate object of type node and sets parameters. Parameters are the same as for the +"set" command. +.PP +.I DEV/NODE_NAME +- specifies the devlink node rate object to create. + +.SS devlink port function rate del - delete node rate object +Delete specified devlink node rate object. Node can't be deleted if there is any +child, user must explicitly unset the parent. +.PP +.I DEV/NODE_NAME +- specifies devlink node rate object to delete. 
+ +.SS devlink port function rate help - display usage information +Display devlink rate usage information + +.SH "EXAMPLES" + +.PP +\fB*\fR Display all rate objects: +.RS 4 +.PP +# devlink port function rate show +.br +pci/0000:03:00.0/1 type leaf parent some_group +.br +pci/0000:03:00.0/2 type leaf tx_share 12Mbit +.br +pci/0000:03:00.0/some_group type node tx_share 1Gbps tx_max 5Gbps +.RE + +.PP +\fB*\fR Display leaf rate object bound to the 1st devlink port of the +pci/0000:03:00.0 device: +.RS 4 +.PP +# devlink port function rate show pci/0000:03:00.0/1 +.br +pci/0000:03:00.0/1 type leaf +.br +.RE + +.PP +\fB*\fR Display leaf rate object rate values using IEC units: +.RS 4 +.PP +# devlink -i port function rate show pci/0000:03:00.0/2 +.br +pci/0000:03:00.0/2 type leaf 11718Kibit +.br +.RE + +.PP +\fB*\fR Display node rate object with name some_group of the pci/0000:03:00.0 device: +.RS 4 +.PP +# devlink port function rate show pci/0000:03:00.0/some_group +.br +pci/0000:03:00.0/some_group type node +.br +.RE + +.PP +\fB*\fR Display pci/0000:03:00.0/2 leaf rate object as pretty JSON output: +.RS 4 +.PP +# devlink -jp port function rate show pci/0000:03:00.0/2 +.br +{ +.br + "rate": { +.br + "pci/0000:03:00.0/2": { +.br + "type": "leaf", +.br + "tx_share": 1500000 +.br + } +.br + } +.br +} +.RE + +.PP +\fB*\fR Create node rate object with name "1st_group" on pci/0000:03:00.0 device: +.RS 4 +.PP +# devlink port function rate add pci/0000:03:00.0/1st_group +.RE + +.PP +\fB*\fR Create node rate object with specified parameters: +.RS 4 +.PP +# devlink port function rate add pci/0000:03:00.0/2nd_group \\ +.br + tx_share 10Mbit tx_max 30Mbit parent 1st_group +.RE + +.PP +\fB*\fR Set parameters to the specified leaf rate object: +.RS 4 +.PP +# devlink port function rate set pci/0000:03:00.0/1 \\ +.br + tx_share 2Mbit tx_max 10Mbit +.RE + +.PP +\fB*\fR Set leaf's parent to "1st_group": +.RS 4 +.PP +# devlink port function rate set pci/0000:03:00.0/1 parent 1st_group +.RE 
+ +.PP +\fB*\fR Unset leaf's parent: +.RS 4 +.PP +# devlink port function rate set pci/0000:03:00.0/1 noparent +.RE + +.PP +\fB*\fR Delete node rate object: +.RS 4 +.PP +# devlink port function rate del pci/0000:03:00.0/2nd_group +.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-port (8) +.br + +.SH AUTHOR +Dmytro Linkin <dlinkin@nvidia.com> diff --git a/man/man8/devlink-region.8 b/man/man8/devlink-region.8 new file mode 100644 index 0000000..b706796 --- /dev/null +++ b/man/man8/devlink-region.8 @@ -0,0 +1,156 @@ +.TH DEVLINK\-REGION 8 "10 Jan 2018" "iproute2" "Linux" +.SH NAME +devlink-region \- devlink address region access +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B region +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-n\fR[\fIno-nice-names\fR] } + +.ti -8 +.BR "devlink region show" +.RI "[ " DEV/REGION " ]" + +.ti -8 +.BR "devlink region new" +.RI "" DEV/REGION "" +.BR "[ " +.BR "snapshot" +.RI "" SNAPSHOT_ID "" +.BR "]" + +.ti -8 +.BR "devlink region del" +.RI "" DEV/REGION "" +.BR "snapshot" +.RI "" SNAPSHOT_ID "" + +.ti -8 +.BR "devlink region dump" +.RI "" DEV/REGION "" +.BR "snapshot" +.RI "" SNAPSHOT_ID "" + +.ti -8 +.BR "devlink region read" +.RI "" DEV/REGION "" +.BR "[ " +.BR "snapshot" +.RI "" SNAPSHOT_ID "" +.BR "]" +.BR "address" +.RI "" ADDRESS " +.BR "length" +.RI "" LENGTH "" + +.ti -8 +.B devlink region help + +.SH "DESCRIPTION" +.SS devlink region show - Show all supported address regions names, snapshots and sizes + +.PP +.I "DEV/REGION" +- specifies the devlink device and address-region to query. + +.SS devlink region new - Create a snapshot specified by address-region name and snapshot ID + +.PP +.I "DEV/REGION" +- specifies the devlink device and address-region to snapshot + +.PP +snapshot +.I "SNAPSHOT_ID" +- optionally specifies the snapshot ID to assign. If not specified, devlink will assign a unique ID to the snapshot. 
+ +.SS devlink region del - Delete a snapshot specified by address-region name and snapshot ID + +.PP +.I "DEV/REGION" +- specifies the devlink device and address-region to delete the snapshot from + +.PP +snapshot +.I "SNAPSHOT_ID" +- specifies the snapshot ID to delete + +.SS devlink region dump - Dump all the available data from a region or from a snapshot of a region + +.PP +.I "DEV/REGION" +- specifies the device and address-region to dump from. + +.PP +snapshot +.I "SNAPSHOT_ID" +- specifies the snapshot-id of the region to dump. + +.SS devlink region read - Read from a specific region address for a given length + +.PP +.I "DEV/REGION" +- specifies the device and address-region to read from. + +.PP +snapshot +.I "SNAPSHOT_ID" +- specifies the snapshot-id of the region to read. + +.PP +address +.I "ADDRESS" +- specifies the address to read from. + +.PP +length +.I "LENGTH" +- specifies the length of data to read. + +.SH "EXAMPLES" +.PP +devlink region show +.RS 4 +List available address regions and snapshots. +.RE +.PP +devlink region new pci/0000:00:05.0/cr-space +.RS 4 +Create a new snapshot from cr-space address region from device pci/0000:00:05.0. +.RE +.PP +devlink region del pci/0000:00:05.0/cr-space snapshot 1 +.RS 4 +Delete snapshot id 1 from cr-space address region from device pci/0000:00:05.0.
+.RE +.PP +devlink region dump pci/0000:00:05.0/cr-space snapshot 1 +.RS 4 +Dump the snapshot taken from cr-space address region with ID 1. +.RE +.PP +devlink region read pci/0000:00:05.0/cr-space snapshot 1 address 0x10 length 16 +.RS 4 +Read from address 0x10, 16 Bytes of snapshot ID 1 taken from cr-space address region. +.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-port (8), +.BR devlink-monitor (8), +.br + +.SH AUTHOR +Alex Vesker <valex@mellanox.com> diff --git a/man/man8/devlink-resource.8 b/man/man8/devlink-resource.8 new file mode 100644 index 0000000..8c31580 --- /dev/null +++ b/man/man8/devlink-resource.8 @@ -0,0 +1,79 @@ +.TH DEVLINK\-RESOURCE 8 "11 Feb 2018" "iproute2" "Linux" +.SH NAME +devlink-resource \- devlink device resource configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B resource +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-v\fR[\fIerbose\fR] } + +.ti -8 +.B devlink resource show +.IR DEV + +.ti -8 +.B devlink resource help + +.ti -8 +.BR "devlink resource set" +.IR DEV +.BI path " RESOURCE_PATH" +.BI size " RESOURCE_SIZE" + +.SH "DESCRIPTION" +.SS devlink resource show - display devlink device's resources + +.PP +.I "DEV" +- specifies the devlink device to show. + +.in +4 +Format is: +.in +2 +BUS_NAME/BUS_ADDRESS + +.SS devlink resource set - sets resource size of specific resource + +.PP +.I "DEV" +- specifies the devlink device. + +.TP +.BI path " RESOURCE_PATH" +Resource's path. + +.TP +.BI size " RESOURCE_SIZE" +The new resource's size. + +.SH "EXAMPLES" +.PP +devlink resource show pci/0000:01:00.0 +.RS 4 +Shows the resources of the specified devlink device. +.RE +.PP +devlink resource set pci/0000:01:00.0 path /kvd/linear size 98304 +.RS 4 +Sets the size of the specified resource for the specified devlink device.
+.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-port (8), +.BR devlink-sb (8), +.BR devlink-monitor (8), +.br + +.SH AUTHOR +Arkadi Sharshevsky <arkadis@mellanox.com> diff --git a/man/man8/devlink-sb.8 b/man/man8/devlink-sb.8 new file mode 100644 index 0000000..5a5a9bb --- /dev/null +++ b/man/man8/devlink-sb.8 @@ -0,0 +1,324 @@ +.TH DEVLINK\-SB 8 "14 Apr 2016" "iproute2" "Linux" +.SH NAME +devlink-sb \- devlink shared buffer configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B sb +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-n\fR[\fIno-nice-names\fR] } + +.ti -8 +.BR "devlink sb show " +.RI "[ " DEV " [ " +.B sb +.IR SB_INDEX " ] ]" + +.ti -8 +.BR "devlink sb pool show " +.RI "[ " DEV " [ " +.B sb +.IR SB_INDEX " ] " +.br +.B pool +.IR POOL_INDEX " ]" + +.ti -8 +.BI "devlink sb pool set " DEV " +.RB "[ " sb +.IR SB_INDEX " ] " +.br +.BI pool " POOL_INDEX " +.br +.BI size " POOL_SIZE " +.br +.BR thtype " { " static " | " dynamic " }" + +.ti -8 +.BR "devlink sb port pool show " +.RI "[ " DEV/PORT_INDEX " [ " +.B sb +.IR SB_INDEX " ] " +.br +.B pool +.IR POOL_INDEX " ]" + +.ti -8 +.BI "devlink sb port pool set " DEV/PORT_INDEX " +.RB "[ " sb +.IR SB_INDEX " ] " +.br +.BI pool " POOL_INDEX " +.br +.BI th " THRESHOLD " + +.ti -8 +.BR "devlink sb tc bind show " +.RI "[ " DEV/PORT_INDEX " [ " +.B sb +.IR SB_INDEX " ] " +.br +.BI tc " TC_INDEX " +.br +.B type +.RB "{ " ingress " | " egress " } ]" + +.ti -8 +.BI "devlink sb tc bind set " DEV/PORT_INDEX " +.RB "[ " sb +.IR SB_INDEX " ] " +.br +.BI tc " TC_INDEX " +.br +.BR type " { " ingress " | " egress " }" +.br +.BI pool " POOL_INDEX " +.br +.BI th " THRESHOLD " + +.ti -8 +.BR "devlink sb occupancy show " +.RI "{ " DEV " | " DEV/PORT_INDEX " } [ " +.B sb +.IR SB_INDEX " ] " + +.ti -8 +.BR "devlink sb occupancy snapshot " +.IR DEV " [ " +.B sb +.IR SB_INDEX " ]" + +.ti -8 +.BR "devlink sb occupancy clearmax " +.IR 
DEV " [ " +.B sb +.IR SB_INDEX " ]" + +.ti -8 +.B devlink sb help + +.SH "DESCRIPTION" +.SS devlink sb show - display available shared buffers and their attributes + +.PP +.I "DEV" +- specifies the devlink device to show shared buffers. +If this argument is omitted all shared buffers of all devices are listed. + +.PP +.I "SB_INDEX" +- specifies the shared buffer. +If this argument is omitted shared buffer with index 0 is selected. +Behaviour of this argument is the same for every command. + +.SS devlink sb pool show - display available pools and their attributes + +.PP +.I "DEV" +- specifies the devlink device to show pools. +If this argument is omitted all pools of all devices are listed. + +Display available pools listing their +.B type, size, thtype +and +.B cell_size. cell_size +is the allocation granularity of memory within the shared buffer. Drivers +may round up, round down or reject +.B size +passed to the set command if it is not a multiple of +.B cell_size. + +.SS devlink sb pool set - set attributes of pool + +.PP +.I "DEV" +- specifies the devlink device to set pool. + +.TP +.BI size " POOL_SIZE" +size of the pool in Bytes. + +.TP +.BR thtype " { " static " | " dynamic " } " +pool threshold type. + +.I static +- Threshold values for the pool will be passed in Bytes. + +.I dynamic +- Threshold values ("to_alpha") for the pool will be used to compute alpha parameter according to formula: +.br +.in +16 +alpha = 2 ^ (to_alpha - 10) +.in -16 + +.in +10 +The range of the passed value is between 0 and 20. The computed alpha is used to determine the maximum usage of the flow: +.in -10 +.br +.in +16 +max_usage = alpha / (1 + alpha) * Free_Buffer +.in -16 + +.SS devlink sb port pool show - display port-pool combinations and threshold for each +.I "DEV/PORT_INDEX" +- specifies the devlink port. + +.TP +.BI pool " POOL_INDEX" +pool index. + +.SS devlink sb port pool set - set port-pool threshold +.I "DEV/PORT_INDEX" +- specifies the devlink port.
+ +.TP +.BI pool " POOL_INDEX" +pool index. + +.TP +.BI th " THRESHOLD" +threshold value. Type of the value is either Bytes or "to_alpha", depending on +.B thtype +set for the pool. + +.SS devlink sb tc bind show - display port-TC to pool bindings and threshold for each + +.I "DEV/PORT_INDEX" +- specifies the devlink port. + +.TP +.BI tc " TC_INDEX" +index of either ingress or egress TC, usually in range 0 to 8 (depends on device). + +.TP +.BR type " { " ingress " | " egress " } " +TC type. + +.SS devlink sb tc bind set - set port-TC to pool binding with specified threshold + +.I "DEV/PORT_INDEX" +- specifies the devlink port. + +.TP +.BI tc " TC_INDEX" +index of either ingress or egress TC, usually in range 0 to 8 (depends on device). + +.TP +.BR type " { " ingress " | " egress " } " +TC type. + +.TP +.BI pool " POOL_INDEX" +index of pool to bind this to. + +.TP +.BI th " THRESHOLD" +threshold value. Type of the value is either Bytes or "to_alpha", depending on +.B thtype +set for the pool. + +.SS devlink sb occupancy show - display shared buffer occupancy values for device or port + +.PP +This command is used to browse shared buffer occupancy values. Values are shown for every port-pool combination as well as for all port-TC combinations (with pool this port-TC is bound to). Format of value is: +.br +.in +16 +current_value/max_value +.in -16 +Note that before showing values, one has to issue +.B occupancy snapshot +command first. + +.PP +.I "DEV" +- specifies the devlink device to show occupancy values for. + +.I "DEV/PORT_INDEX" +- specifies the devlink port to show occupancy values for. + +.SS devlink sb occupancy snapshot - take occupancy snapshot of shared buffer for device +This command is used to take a snapshot of shared buffer occupancy values. After that, the values can be shown using +.B occupancy show +command. + +.PP +.I "DEV" +- specifies the devlink device to take occupancy snapshot on.
+ +.SS devlink sb occupancy clearmax - clear occupancy watermarks of shared buffer for device +This command is used to reset maximal occupancy values reached for whole device. Note that before browsing reset values, one has to issue +.B occupancy snapshot +command. + +.PP +.I "DEV" +- specifies the devlink device to clear occupancy watermarks on. + +.SH "EXAMPLES" +.PP +devlink sb show +.RS 4 +List available shared buffers. +.RE +.PP +devlink sb pool show +.RS 4 +List available pools and their config. +.RE +.PP +devlink sb port pool show pci/0000:03:00.0/1 pool 0 +.RS 4 +Show port-pool setup for specified port and pool. +.RE +.PP +sudo devlink sb port pool set pci/0000:03:00.0/1 pool 0 th 15 +.RS 4 +Change threshold for specified port and pool. +.RE +.PP +devlink sb tc bind show pci/0000:03:00.0/1 tc 0 type ingress +.RS 4 +Show pool binding and threshold for specified port and TC. +.RE +.PP +sudo devlink sb tc bind set pci/0000:03:00.0/1 tc 0 type ingress pool 0 th 9 +.RS 4 +Set pool binding and threshold for specified port and TC. +.RE +.PP +sudo devlink sb occupancy snapshot pci/0000:03:00.0 +.RS 4 +Make a snapshot of occupancy of shared buffer for specified devlink device. +.RE +.PP +devlink sb occupancy show pci/0000:03:00.0/1 +.RS 4 +Show occupancy for specified port from the snapshot. +.RE +.PP +sudo devlink sb occupancy clearmax pci/0000:03:00.0 +.RS 4 +Clear watermarks for shared buffer of specified devlink device.
+.RE + + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-port (8), +.BR devlink-monitor (8), +.br + +.SH AUTHOR +Jiri Pirko <jiri@mellanox.com> diff --git a/man/man8/devlink-trap.8 b/man/man8/devlink-trap.8 new file mode 100644 index 0000000..f5e6641 --- /dev/null +++ b/man/man8/devlink-trap.8 @@ -0,0 +1,195 @@ +.TH DEVLINK\-TRAP 8 "2 August 2019" "iproute2" "Linux" +.SH NAME +devlink-trap \- devlink trap configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B trap +.RI "{ " COMMAND " |" +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-v\fR[\fIerbose\fR] | +\fB\-s\fR[\fItatistics\fR] } + +.ti -8 +.B "devlink trap show" +.RI "[ " DEV +.B trap +.IR TRAP " ]" + +.ti -8 +.BI "devlink trap set " DEV " trap " TRAP +.RB "[ " action " { " trap " | " drop " | " mirror " } ]" + +.ti -8 +.B "devlink trap group show" +.RI "[ " DEV +.B group +.IR GROUP " ]" + +.ti -8 +.BI "devlink trap group set " DEV " group " GROUP +.RB "[ " action " { " trap " | " drop " | " mirror " } ]" +.br +.RB "[ " policer +.IB "POLICER " ] +.RB "[ " nopolicer " ]" + +.ti -8 +.BI "devlink trap policer set " DEV " policer " POLICER +.RB "[ " rate +.IR "RATE " ] +.RB "[ " burst +.IR "BURST " ] + +.ti -8 +.B devlink trap help + +.SH "DESCRIPTION" +.SS devlink trap show - display available packet traps and their attributes + +.PP +.I "DEV" +- specifies the devlink device from which to show packet traps. +If this argument is omitted all packet traps of all devices are listed. + +.PP +.BI "trap " TRAP +- specifies the packet trap. +Only applicable if a devlink device is also specified. + +.SS devlink trap set - set attributes of a packet trap + +.PP +.I "DEV" +- specifies the devlink device the packet trap belongs to. + +.PP +.BI "trap " TRAP +- specifies the packet trap. + +.TP +.BR action " { " trap " | " drop " | " mirror " } " +packet trap action. + +.I trap +- the sole copy of the packet is sent to the CPU. 
+ +.I drop +- the packet is dropped by the underlying device and a copy is not sent to the CPU. + +.I mirror +- the packet is forwarded by the underlying device and a copy is sent to the CPU. + +.SS devlink trap group show - display available packet trap groups and their attributes + +.PP +.I "DEV" +- specifies the devlink device from which to show packet trap groups. +If this argument is omitted all packet trap groups of all devices are listed. + +.PP +.BI "group " GROUP +- specifies the packet trap group. +Only applicable if a devlink device is also specified. + +.SS devlink trap group set - set attributes of a packet trap group + +.PP +.I "DEV" +- specifies the devlink device the packet trap group belongs to. + +.PP +.BI "group " GROUP +- specifies the packet trap group. + +.TP +.BR action " { " trap " | " drop " | " mirror " } " +packet trap action. The action is set for all the packet traps member in the +trap group. The actions of non-drop traps cannot be changed and are thus +skipped. + +.TP +.BI policer " POLICER" +packet trap policer. The policer to bind to the packet trap group. A value of +"0" will unbind the currently bound policer. + +.TP +.B nopolicer +Unbind packet trap policer from the packet trap group. + +.SS devlink trap policer set - set attributes of packet trap policer + +.PP +.I "DEV" +- specifies the devlink device the packet trap policer belongs to. + +.PP +.BI "policer " POLICER +- specifies the packet trap policer. + +.PP +.BI rate " RATE " +- packet trap policer rate in packets per second. + +.PP +.BI burst " BURST " +- packet trap policer burst size in packets. + +.SH "EXAMPLES" +.PP +devlink trap show +.RS 4 +List available packet traps. +.RE +.PP +devlink trap group show +.RS 4 +List available packet trap groups. +.RE +.PP +devlink -vs trap show pci/0000:01:00.0 trap source_mac_is_multicast +.RS 4 +Show attributes and statistics of a specific packet trap. 
+.RE +.PP +devlink -s trap group show pci/0000:01:00.0 group l2_drops +.RS 4 +Show attributes and statistics of a specific packet trap group. +.RE +.PP +devlink trap set pci/0000:01:00.0 trap source_mac_is_multicast action trap +.RS 4 +Set the action of a specific packet trap to 'trap'. +.RE +.PP +devlink trap policer show +.RS 4 +List available packet trap policers. +.RE +.PP +devlink -s trap policer show pci/0000:01:00.0 policer 1 +.RS 4 +Show attributes and statistics of a specific packet trap policer. +.RE +.PP +devlink trap policer set pci/0000:01:00.0 policer 1 rate 1000 burst 128 +.RS 4 +Set the rate and burst size of a specific packet trap policer. +.RE + +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-monitor (8), +.br + +.SH AUTHOR +Ido Schimmel <idosch@mellanox.com> diff --git a/man/man8/devlink.8 b/man/man8/devlink.8 new file mode 100644 index 0000000..de53061 --- /dev/null +++ b/man/man8/devlink.8 @@ -0,0 +1,147 @@ +.TH DEVLINK 8 "14 Mar 2016" "iproute2" "Linux" +.SH NAME +devlink \- Devlink tool +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ] { " dev | port | monitor | sb | resource | region | health | trap " } { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.B devlink +.RB "[ " -force " ] " +.BI "-batch " filename +.sp + +.SH OPTIONS + +.TP +.BR "\-V" , " --Version" +Print the version of the +.B devlink +utility and exit. + +.TP +.BR "\-b", " \-batch " <FILENAME> +Read commands from provided file or standard input and invoke them. +First failure will cause termination of devlink. + +.TP +.B \-force +Don't terminate devlink on errors in batch mode. +If there were any errors during execution of the commands, the application return code will be non zero. + +.TP +.BR "\-n" , " --no-nice-names" +Turn off printing out nice names, for example netdevice ifnames instead of devlink port identification. + +.TP +.BR "\-j" , " --json" +Generate JSON output. 
+ +.TP +.BR "\-p" , " --pretty" +When combined with -j generate a pretty JSON output. + +.TP +.BR "\-v" , " --verbose" +Turn on verbose output. + +.TP +.BR "\-s" , " --statistics" +Output statistics. + +.TP +.BR "\-N", " \-Netns " <NETNSNAME> +Switches to the specified network namespace. + +.TP +.BR "\-i", " --iec" +Print human readable rates in IEC units (e.g. 1Ki = 1024). + +.TP +.BR "\-x", " --hex" +Print dump numbers in hexadecimal format. + +.SS +.I OBJECT + +.TP +.B dev +- devlink device. + +.TP +.B port +- devlink port. + +.TP +.B monitor +- watch for netlink messages. + +.TP +.B sb +- devlink shared buffer configuration. + +.TP +.B resource +- devlink device resource configuration. + +.TP +.B region +- devlink address region access + +.TP +.B health +- devlink reporting and recovery + +.TP +.B trap +- devlink trap configuration + +.SS +.I COMMAND + +Specifies the action to perform on the object. +The set of possible actions depends on the object type. +As a rule, it is possible to +.B show +(or +.B list +) objects, but some objects do not allow all of these operations +or have some additional commands. The +.B help +command is available for all objects. It prints +out a list of available commands and argument syntax conventions. +.sp +If no command is given, some default command is assumed. +Usually it is +.B list +or, if the objects of this class cannot be listed, +.BR "help" . + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR devlink-dev (8), +.BR devlink-port (8), +.BR devlink-monitor (8), +.BR devlink-sb (8), +.BR devlink-resource (8), +.BR devlink-region (8), +.BR devlink-health (8), +.BR devlink-trap (8), +.br + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. 
+ +.SH AUTHOR +Jiri Pirko <jiri@mellanox.com> diff --git a/man/man8/genl.8 b/man/man8/genl.8 new file mode 100644 index 0000000..b9de594 --- /dev/null +++ b/man/man8/genl.8 @@ -0,0 +1,77 @@ +.TH GENL 8 "29 Oct 2015" "iproute2" "Linux" +.SH NAME +genl \- generic netlink utility frontend +.SH SYNOPSIS +.in +8 +.ti -8 +.BR genl " [ " -s [ tatistics "] ] [ " -d [ etails "] ] [ " -r [ aw "] ] " OBJECT + +.ti -8 +.BR genl " { " -V [ ersion "] | " -h [ elp "] }" + +.ti -8 +.IR OBJECT " := { " +.B ctrl +.IR CTRL_OPTS " }" + +.ti -8 +.IR CTRL_OPTS " := { " +.BR help " | " list " | " monitor " | " get +.IR PARMS " }" + +.ti -8 +.IR PARMS " := { " +.B name +.IR NAME " | " +.B id +.IR ID " }" +.SH DESCRIPTION +The +.B genl +utility provides a simple frontend to the generic netlink library. Although it's +designed to support multiple +.IR OBJECT s, +for now only the +.B ctrl +object is available, which is used to query the generic netlink controller. +.SS ctrl +The generic netlink controller can be queried in various ways: +.TP +.B help +This command just prints a help text for the +.B ctrl +object. +.TP +.B list +Show the registered netlink users. +.TP +.B monitor +Listen for generic netlink notifications. +.TP +.B get +Query the controller for a given user, identified either by +.BR name " or " id . +.SH OPTIONS +genl supports the following options. +.TP +.B \-h, \-help +Show summary of options. +.TP +.B \-V, \-Version +Show version of program. +.TP +.B \-s, \-stats, \-statistics +Show object statistics. +.TP +.B \-d, \-details +Show object details. +.TP +.B \-r, \-raw +Dump raw output only. +.SH SEE ALSO +.BR ip (8) +.br +.SH AUTHOR +genl was written by Jamal Hadi Salim <hadi@cyberus.ca>. +.PP +This manual page was written by Petr Sabata <contyk@redhat.com>. 
diff --git a/man/man8/ifstat.8 b/man/man8/ifstat.8 new file mode 100644 index 0000000..8cd164d --- /dev/null +++ b/man/man8/ifstat.8 @@ -0,0 +1,77 @@ +.TH IFSTAT 8 "28 Oct 2015" "iproute2" "Linux" +.SH NAME +ifstat \- handy utility to read network interface statistics +.SH SYNOPSIS +.in +8 +.ti -8 +.BR ifstat " [ " +.IR OPTIONS " ] [ " INTERFACE_LIST " ]" + +.ti -8 +.IR INTERFACE_LIST " := " INTERFACE_LIST " | " interface +.SH DESCRIPTION +\fBifstat\fP neatly prints out network interface statistics. +The utility keeps records of the previous data displayed in history files and +by default only shows the difference between the last and the current call. +Location of the history files defaults to /tmp/.ifstat.u$UID but may be +overridden with the IFSTAT_HISTORY environment variable. Similarly, the default +location for xstat (extended stats) is /tmp/.<xstat name>_ifstat.u$UID. +.SH OPTIONS +.TP +.B \-h, \-\-help +Show summary of options. +.TP +.B \-V, \-\-version +Show version of program. +.TP +.B \-a, \-\-ignore +Ignore the history file. +.TP +.B \-d, \-\-scan=SECS +Sample statistics every SECS seconds. +.TP +.B \-e, \-\-errors +Show errors. +.TP +.B \-n, \-\-nooutput +Don't display any output. Update the history file only. +.TP +.B \-r, \-\-reset +Reset history. +.TP +.B \-s, \-\-noupdate +Don't update the history file. +.TP +.B \-t, \-\-interval=SECS +Report average over the last SECS seconds. +.TP +.B \-z, \-\-zeros +Show entries with zero activity. +.TP +.B \-j, \-\-json +Display results in JSON format. +.TP +.B \-p, \-\-pretty +If combined with +.BR \-\-json , +pretty print the output. +.TP +.B \-x, \-\-extended=TYPE +Show extended stats of TYPE. Supported types are: + +.in +8 +.B cpu_hits +- Counts only packets that went via the CPU. +.in -8 + +.SH ENVIRONMENT +.TP +.B IFSTAT_HISTORY +If set, its value is interpreted as alternate history file path. +.SH SEE ALSO +.BR ip (8) +.br +.SH AUTHOR +ifstat was written by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>.
+.PP +This manual page was written by Petr Sabata <contyk@redhat.com>. diff --git a/man/man8/ip-address.8.in b/man/man8/ip-address.8.in new file mode 100644 index 0000000..a5ae47a --- /dev/null +++ b/man/man8/ip-address.8.in @@ -0,0 +1,514 @@ +.TH "IP\-ADDRESS" 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-address \- protocol address management +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B address +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.BR "ip address" " { " add " | " change " | " replace " } " +.IB IFADDR " dev " IFNAME +.RI "[ " LIFETIME " ] [ " CONFFLAG-LIST " ]" + +.ti -8 +.BR "ip address del" +.IB IFADDR " dev " IFNAME " [ " mngtmpaddr " ]" + +.ti -8 +.BR "ip address" " { " save " | " flush " } [ " dev +.IR IFNAME " ] [ " +.B scope +.IR SCOPE-ID " ] [ " +.B metric +.IR METRIC " ] [ " +.B to +.IR PREFIX " ] [ " FLAG-LIST " ] [ " +.B label +.IR PATTERN " ] [ " up " ]" + +.ti -8 +.BR "ip address" " [ " show " [ " dev +.IR IFNAME " ] [ " +.B scope +.IR SCOPE-ID " ] [ " +.B to +.IR PREFIX " ] [ " FLAG-LIST " ] [ " +.B label +.IR PATTERN " ] [ " +.B master +.IR DEVICE " ] [ " +.B type +.IR TYPE " ] [ " +.B vrf +.IR NAME " ] [ " +.BR up " ] [" +.BR nomaster " ] [" +.B proto +.IR ADDRPROTO " ] ]" + +.ti -8 +.BR "ip address" " { " showdump " | " restore " }" + +.ti -8 +.IR IFADDR " := " PREFIX " | " ADDR +.B peer +.IR PREFIX " [ " +.B broadcast +.IR ADDR " ] [ " +.B anycast +.IR ADDR " ] [ " +.B label +.IR LABEL " ] [ " +.B scope +.IR SCOPE-ID " ] [ " +.B proto +.IR ADDRPROTO " ]" + +.ti -8 +.IR SCOPE-ID " := " +.RB "[ " host " | " link " | " global " | " +.IR NUMBER " ]" + +.ti -8 +.IR ADDRPROTO " := [ " +.IR NAME " | " NUMBER " ]" + +.ti -8 +.IR FLAG-LIST " := [ " FLAG-LIST " ] " FLAG + +.ti -8 +.IR FLAG " := [" +.RB [ - ] permanent " |" +.RB [ - ] dynamic " |" +.RB [ - ] secondary " |" +.RB [ - ] primary " |" +.RB [ - ] tentative " |" +.RB [ - ] deprecated " |" +.RB [ - ] dadfailed " |" +.RB [ - ] temporary " |"
+.IR CONFFLAG-LIST " ]" + +.ti -8 +.IR CONFFLAG-LIST " := [ " CONFFLAG-LIST " ] " CONFFLAG + +.ti -8 +.IR CONFFLAG " := " +.RB "[ " home " | " mngtmpaddr " | " nodad " | " optimistic " | " noprefixroute " | " autojoin " ]" + +.ti -8 +.IR LIFETIME " := [ " +.BI valid_lft " LFT" +.RB "] [ " preferred_lft +.IR LFT " ]" + +.ti -8 +.IR LFT " := [ " +.BR forever " |" +.IR SECONDS " ]" + +.ti -8 +.IR TYPE " := [ " +.BR bridge " | " +.BR bridge_slave " |" +.BR bond " | " +.BR bond_slave " |" +.BR can " | " +.BR dummy " | " +.BR hsr " | " +.BR ifb " | " +.BR ipoib " |" +.BR macvlan " | " +.BR macvtap " | " +.BR vcan " | " +.BR veth " | " +.BR vlan " | " +.BR vxlan " |" +.BR ip6tnl " |" +.BR ipip " |" +.BR sit " |" +.BR gre " |" +.BR gretap " |" +.BR erspan " |" +.BR ip6gre " |" +.BR ip6gretap " |" +.BR ip6erspan " |" +.BR vti " |" +.BR vrf " |" +.BR nlmon " |" +.BR ipvlan " |" +.BR lowpan " |" +.BR geneve " |" +.BR macsec " |" +.BR netkit " ]" + +.SH "DESCRIPTION" +The +.B address +is a protocol (IPv4 or IPv6) address attached +to a network device. Each device must have at least one address +to use the corresponding protocol. It is possible to have several +different addresses attached to one device. These addresses are not +discriminated, so that the term +.B alias +is not quite appropriate for them and we do not use it in this document. +.sp +The +.B ip address +command displays addresses and their properties, adds new addresses +and deletes old ones. + +.SS ip address add - add new protocol address. + +.TP +.BI dev " IFNAME " +the name of the device to add the address to. + +.TP +.BI local " ADDRESS " (default) +the address of the interface. The format of the address depends +on the protocol. It is a dotted quad for IP and a sequence of +hexadecimal halfwords separated by colons for IPv6. The +.I ADDRESS +may be followed by a slash and a decimal number which encodes +the network prefix length. 
+ +.TP +.BI peer " ADDRESS" +the address of the remote endpoint for pointopoint interfaces. +Again, the +.I ADDRESS +may be followed by a slash and a decimal number, encoding the network +prefix length. If a peer address is specified, the local address +cannot have a prefix length. The network prefix is associated +with the peer rather than with the local address. + +.TP +.BI broadcast " ADDRESS" +the broadcast address on the interface. +.sp +It is possible to use the special symbols +.B '+' +and +.B '-' +instead of the broadcast address. In this case, the broadcast address +is derived by setting/resetting the host bits of the interface prefix. + +.TP +.BI label " LABEL" +Each address may be tagged with a label string. +The maximum allowed total length of label is 15 characters. + +.TP +.BI scope " SCOPE_VALUE" +the scope of the area where this address is valid. +The available scopes are listed in +.BR @SYSCONF_USR_DIR@/rt_scopes " or " @SYSCONF_ETC_DIR@/rt_scopes +(has precedence if exists). +Predefined scope values are: + +.in +8 +.B global +- the address is globally valid. +.sp +.B site +- (IPv6 only, deprecated) the address is site local, i.e. it is +valid inside this site. +.sp +.B link +- the address is link local, i.e. it is valid only on this device. +.sp +.B host +- the address is valid only inside this host. +.in -8 + +.TP +.BI metric " NUMBER" +priority of prefix route associated with address. + +.TP +.BI valid_lft " LFT" +the valid lifetime of this address; see section 5.5.4 of +RFC 4862. When it expires, the address is removed by the kernel. +Defaults to +.BR "forever" . + +.TP +.BI preferred_lft " LFT" +the preferred lifetime of this address; see section 5.5.4 +of RFC 4862. When it expires, the address is no longer used for new +outgoing connections. Defaults to +.BR "forever" . + +.TP +.B home +(IPv6 only) designates this address the "home address" as defined in +RFC 6275. 
+ +.TP +.B mngtmpaddr +(IPv6 only) make the kernel manage temporary addresses created from this one as +template on behalf of Privacy Extensions (RFC3041). For this to become active, +the \fBuse_tempaddr\fP sysctl setting has to be set to a value greater than +zero. The given address needs to have a prefix length of 64. This flag allows +to use privacy extensions in a manually configured network, just like if +stateless auto-configuration was active. + +.TP +.B nodad +(IPv6 only) do not perform Duplicate Address Detection (RFC 4862) when +adding this address. + +.TP +.B optimistic +(IPv6 only) When performing Duplicate Address Detection, use the RFC 4429 +optimistic variant. + +.TP +.B noprefixroute +Do not automatically create a route for the network prefix of the added +address, and don't search for one to delete when removing the address. Changing +an address to add this flag will remove the automatically added prefix route, +changing it to remove this flag will create the prefix route automatically. + +.TP +.B autojoin +Joining multicast groups on Ethernet level via +.B "ip maddr" +command does not work if connected to an Ethernet switch that does IGMP +snooping since the switch would not replicate multicast packets on ports that +did not have IGMP reports for the multicast addresses. + +Linux VXLAN interfaces created via +.B "ip link add vxlan" +have the +.B group +option that enables them to do the required join. + +Using the +.B autojoin +flag when adding a multicast address enables similar functionality for +Openvswitch VXLAN interfaces as well as other tunneling mechanisms that need to +receive multicast traffic. + +.TP +.BI proto " ADDRPROTO" +the protocol identifier of this route. +.I ADDRPROTO +may be a number or a string from the file +.BR "/etc/iproute2/rt_addrprotos" . +If the protocol ID is not given, + +.B ip assumes protocol 0. Several protocol +values have a fixed interpretation. 
Namely: + +.in +8 +.B kernel_lo +- The ::1 address that kernel installs on a loopback netdevice has this + protocol value +.sp + +.B kernel_ra +- IPv6 addresses installed in response to router advertisement messages +.sp + +.B kernel_ll +- Link-local addresses have this protocol value +.sp +.in -8 + +.sp +The rest of the values are not reserved and the administrator is free +to assign (or not to assign) protocol tags. + +.SS ip address delete - delete protocol address +.B Arguments: +coincide with the arguments of +.B ip addr add. +The device name is a required argument. The rest are optional. +If no arguments are given, the first address is deleted. + +.SS ip address show - look at protocol addresses + +.TP +.BI dev " IFNAME " (default) +name of device. + +.TP +.BI scope " SCOPE_VAL" +only list addresses with this scope. + +.TP +.BI to " PREFIX" +only list addresses matching this prefix. + +.TP +.BI label " PATTERN" +only list addresses with labels matching the +.IR "PATTERN" . +.I PATTERN +is a usual shell style pattern. + +.TP +.BI master " DEVICE" +only list interfaces enslaved to this master device. + +.TP +.BI vrf " NAME " +only list interfaces enslaved to this vrf. + +.TP +.BI type " TYPE" +only list interfaces of the given type. + +Note that the type name is not checked against the list of supported types - +instead it is sent as-is to the kernel. Later it is used to filter the returned +interface list by comparing it with the relevant attribute in case the kernel +didn't filter already. Therefore any string is accepted, but may lead to empty +output. + +.TP +.B up +only list running interfaces. + +.TP +.B nomaster +only list interfaces with no master. + +.TP +.BR dynamic " and " permanent +(IPv6 only) only list addresses installed due to stateless +address configuration or only list permanent (not dynamic) +addresses. These two flags are inverses of each other, so +.BR -dynamic " is equal to " permanent " and " +.BR -permanent " is equal to " dynamic . 
+ +.TP +.B tentative +(IPv6 only) only list addresses which have not yet passed duplicate +address detection. + +.TP +.B -tentative +(IPv6 only) only list addresses which are not in the process of +duplicate address detection currently. + +.TP +.B deprecated +(IPv6 only) only list deprecated addresses. + +.TP +.B -deprecated +(IPv6 only) only list addresses not being deprecated. + +.TP +.B dadfailed +(IPv6 only) only list addresses which have failed duplicate +address detection. + +.TP +.B -dadfailed +(IPv6 only) only list addresses which have not failed duplicate +address detection. + +.TP +.BR temporary " or " secondary +List temporary IPv6 or secondary IPv4 addresses only. The Linux kernel shares a +single bit for those, so they are actually aliases for each other although the +meaning differs depending on address family. + +.TP +.BR -temporary " or " -secondary +These flags are aliases for +.BR primary . + +.TP +.B primary +List only primary addresses, in IPv6 exclude temporary ones. This flag is the +inverse of +.BR temporary " and " secondary . + +.TP +.B -primary +This is an alias for +.BR temporary " or " secondary . + +.TP +.BI proto " ADDRPROTO" +Only show addresses with a given protocol, or those for which the kernel +response did not include protocol. See the corresponding argument to +.B ip addr add +for details about address protocols. + +.SS ip address flush - flush protocol addresses +This command flushes the protocol addresses selected by some criteria. + +.PP +This command has the same arguments as +.BR show " except that " type " and " master " selectors are not supported." +Another difference is that it does not run when no arguments are given. + +.PP +.B Warning: +This command and other +.B flush +commands are unforgiving. They will cruelly purge all the addresses. + +.PP +With the +.B -statistics +option, the command becomes verbose. It prints out the number of deleted +addresses and the number of rounds made to flush the address list. 
+If this option is given twice, +.B ip address flush +also dumps all the deleted addresses in the format described in the +previous subsection. + +.SH "EXAMPLES" +.PP +ip address show +.RS 4 +Shows IPv4 and IPv6 addresses assigned to all network interfaces. The 'show' +subcommand can be omitted. +.RE +.PP +ip address show up +.RS 4 +Same as above except that only addresses assigned to active network interfaces +are shown. +.RE +.PP +ip address show dev eth0 +.RS 4 +Shows IPv4 and IPv6 addresses assigned to network interface eth0. +.RE +.PP +ip address add 2001:0db8:85a3::0370:7334/64 dev eth1 +.RS 4 +Adds an IPv6 address to network interface eth1. +.RE +.PP +ip address delete 2001:0db8:85a3::0370:7334/64 dev eth1 +.RS 4 +Delete the IPv6 address added above. +.RE +.PP +ip address flush dev eth4 scope global +.RS 4 +Removes all global IPv4 and IPv6 addresses from device eth4. Without 'scope +global' it would remove all addresses including IPv6 link-local ones. +.RE + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-addrlabel.8 b/man/man8/ip-addrlabel.8 new file mode 100644 index 0000000..233d606 --- /dev/null +++ b/man/man8/ip-addrlabel.8 @@ -0,0 +1,56 @@ +.TH IP\-ADDRLABEL 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-addrlabel \- protocol address label management +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip addrlabel +.RI " { " COMMAND " | " +.BR help " }" + +.ti -8 +.BR "ip addrlabel" " { " add " | " del " } " prefix +.BR PREFIX " [ " +.B dev +.IR DEV " ] [ " +.B label +.IR NUMBER " ]" + +.ti -8 +.BR "ip addrlabel" " { " list " | " flush " }" + +.SH "DESCRIPTION" +IPv6 address labels are used for address selection; +they are described in RFC 3484. Precedence is managed by userspace, +and only the label itself is stored in the kernel. + +.SS ip addrlabel add - add an address label +add an address label entry to the kernel. 
+.TP +.BI prefix " PREFIX" +.TP +.BI dev " DEV" +the outgoing interface. +.TP +.BI label " NUMBER" +the label for the prefix. +0xffffffff is reserved. +.SS ip addrlabel del - delete an address label +delete an address label entry from the kernel. +.B Arguments: +coincide with the arguments of +.B ip addrlabel add +but the label is not required. +.SS ip addrlabel list - list address labels +list the current address label entries in the kernel. +.SS ip addrlabel flush - flush address labels +flush all address labels in the kernel. This does not restore any default settings. + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Manpage by Yoshifuji Hideaki / 吉藤英明 diff --git a/man/man8/ip-fou.8 b/man/man8/ip-fou.8 new file mode 100644 index 0000000..f4e08f1 --- /dev/null +++ b/man/man8/ip-fou.8 @@ -0,0 +1,126 @@ +.TH IP\-FOU 8 "2 Nov 2014" "iproute2" "Linux" +.SH "NAME" +ip-fou \- Foo-over-UDP receive port configuration +.P +ip-gue \- Generic UDP Encapsulation receive port configuration +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B fou +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 +.BR "ip fou add" +.B port +.IR PORT +.RB "{ " +.B gue +.RI "|" +.B ipproto +.IR PROTO +.RB " }" +.RB "[ " +.B local +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer_port +.IR PORT +.RB " ]" +.RB "[ " +.B dev +.IR IFNAME +.RB " ]" +.br +.ti -8 +.BR "ip fou del" +.B port +.IR PORT +.RB "[ " +.B local +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer_port +.IR PORT +.RB " ]" +.RB "[ " +.B dev +.IR IFNAME +.RB " ]" +.br +.ti -8 +.B ip fou show +.SH DESCRIPTION +The +.B ip fou +commands are used to create and delete receive ports for Foo-over-UDP +(FOU) as well as Generic UDP Encapsulation (GUE). +.PP +Foo-over-UDP allows encapsulating packets of an IP protocol directly +over UDP. The receiver infers the protocol of a packet received on +a FOU UDP port to be the protocol configured for the port. 
+.PP +Generic UDP Encapsulation (GUE) encapsulates packets of an IP protocol +within UDP and an encapsulation header. The encapsulation header contains the +IP protocol number for the encapsulated packet. +.PP +When creating a FOU or GUE receive port, the port number is specified in +.I PORT +argument. If FOU is used, the IP protocol number associated with the port is specified in +.I PROTO +argument. You can bind a port to a local address/interface, by specifying the +address in the local +.I IFADDR +argument or the device in the +.I IFNAME +argument. If you would like to connect the port, you can specify the peer +address in the peer +.I IFADDR +argument and peer port in the peer_port +.I PORT +argument. +.PP +A FOU or GUE receive port is deleted by specifying +.I PORT +in the delete command, as well as local address/interface or peer address/port +(if set). +.SH EXAMPLES +.PP +.SS Configure a FOU receive port for GRE bound to 7777 +.nf +# ip fou add port 7777 ipproto 47 +.PP +.SS Configure a FOU receive port for IPIP bound to 8888 +.nf +# ip fou add port 8888 ipproto 4 +.PP +.SS Configure a GUE receive port bound to 9999 +.nf +# ip fou add port 9999 gue +.PP +.SS Delete the GUE receive port bound to 9999 +.nf +# ip fou del port 9999 +.SS Configure a FOU receive port for GRE bound to 1.2.3.4:7777 +.nf +# ip fou add port 7777 ipproto 47 local 1.2.3.4 +.PP +.SH SEE ALSO +.br +.BR ip (8) +.SH AUTHOR +Tom Herbert <therbert@google.com> diff --git a/man/man8/ip-gue.8 b/man/man8/ip-gue.8 new file mode 100644 index 0000000..4d2914c --- /dev/null +++ b/man/man8/ip-gue.8 @@ -0,0 +1 @@ +.so man8/ip-fou.8 diff --git a/man/man8/ip-ioam.8 b/man/man8/ip-ioam.8 new file mode 100644 index 0000000..1bdc0ec --- /dev/null +++ b/man/man8/ip-ioam.8 @@ -0,0 +1,72 @@ +.TH IP\-IOAM 8 "05 Jul 2021" "iproute2" "Linux" +.SH "NAME" +ip-ioam \- IPv6 In-situ OAM (IOAM) +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B ip ioam +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 + +.ti -8 +.B ip 
ioam namespace show + +.ti -8 +.B ip ioam namespace add +.I ID +.BR " [ " +.B data +.I DATA32 +.BR "]" +.BR " [ " +.B wide +.I DATA64 +.BR "]" + +.ti -8 +.B ip ioam namespace del +.I ID + +.ti -8 +.B ip ioam schema show + +.ti -8 +.B ip ioam schema add +.I ID DATA + +.ti -8 +.B ip ioam schema del +.I ID + +.ti -8 +.B ip ioam namespace set +.I ID +.B schema +.RI " { " ID " | " +.BR none " }" + +.SH DESCRIPTION +The \fBip ioam\fR command is used to configure IPv6 In-situ OAM (IOAM6) +internal parameters, namely IOAM namespaces and schemas. +.PP +Those parameters also include the mapping between an IOAM namespace and an IOAM +schema. + +.SH EXAMPLES +.PP +.SS Configure an IOAM namespace (ID = 1) with both data (32 bits) and wide data (64 bits) +.nf +# ip ioam namespace add 1 data 0xdeadbeef wide 0xcafec0caf00dc0de +.PP +.SS Link an existing IOAM schema (ID = 7) to an existing IOAM namespace (ID = 1) +.nf +# ip ioam namespace set 1 schema 7 +.SH SEE ALSO +.br +.BR ip-route (8) +.SH AUTHOR +Justin Iurman <justin.iurman@uliege.be> diff --git a/man/man8/ip-l2tp.8 b/man/man8/ip-l2tp.8 new file mode 100644 index 0000000..7109c0a --- /dev/null +++ b/man/man8/ip-l2tp.8 @@ -0,0 +1,412 @@ +.TH IP\-L2TP 8 "19 Apr 2012" "iproute2" "Linux" +.SH "NAME" +ip-l2tp - L2TPv3 static unmanaged tunnel configuration +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B l2tp +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 +.BR "ip l2tp add tunnel" +.br +.BI remote " ADDR " local " ADDR " +.br +.B tunnel_id +.IR ID +.B peer_tunnel_id +.IR ID +.br +.RB "[ " encap " { " ip " | " udp " } ]" +.br +.RB "[ " udp_sport +.IR PORT +.RB " ] [ " udp_dport +.IR PORT +.RB " ]" +.br +.RB "[ " udp_csum " { " on " | " off " } ]" +.br +.RB "[ " udp6_csum_tx " { " on " | " off " } ]" +.br +.RB "[ " udp6_csum_rx " { " on " | " off " } ]" +.br +.ti -8 +.BR "ip l2tp add session" +.RB "[ " name +.IR NAME +.RB " ]" +.br +.B tunnel_id +.IR ID +.B session_id +.IR ID +.B peer_session_id 
+.IR ID +.br +.RB "[ " cookie +.IR HEXSTR +.RB " ] [ " peer_cookie +.IR HEXSTR +.RB " ]" +.br +.RB "[ " l2spec_type " { " none " | " default " } ]" +.br +.RB "[ " seq " { " none " | " send " | " recv " | " both " } ]" +.br +.ti -8 +.BR "ip l2tp del tunnel" +.B tunnel_id +.IR ID +.br +.ti -8 +.BR "ip l2tp del session" +.B tunnel_id +.IR ID +.B session_id +.IR ID +.br +.ti -8 +.BR "ip l2tp show tunnel" " [ " tunnel_id +.IR ID " ]" +.br +.ti -8 +.BR "ip l2tp show session" " [ " tunnel_id +.IR ID " ] [" +.B session_id +.IR ID " ]" +.br +.ti -8 +.IR NAME " := " +.IR STRING +.ti -8 +.IR ADDR " := { " IP_ADDRESS " |" +.BR any " }" +.ti -8 +.IR PORT " := { " NUMBER " }" +.ti -8 +.IR ID " := { " NUMBER " }" +.ti -8 +.ti -8 +.IR HEXSTR " := { 8 or 16 hex digits (4 / 8 bytes) }" +.SH DESCRIPTION +The +.B ip l2tp +commands are used to establish static, or so-called +.I unmanaged +L2TPv3 ethernet tunnels. For unmanaged tunnels, there is no L2TP +control protocol so no userspace daemon is required - tunnels are +manually created by issuing commands at a local system and at a remote +peer. +.PP +L2TPv3 is suitable for Layer-2 tunneling. Static tunnels are useful +to establish network links across IP networks when the tunnels are +fixed. L2TPv3 tunnels can carry data of more than one session. Each +session is identified by a session_id and its parent tunnel's +tunnel_id. A tunnel must be created before a session can be created in +the tunnel. +.PP +When creating an L2TP tunnel, the IP address of the remote peer is +specified, which can be either an IPv4 or IPv6 address. The local IP +address to be used to reach the peer must also be specified. This is +the address on which the local system will listen for and accept +received L2TP data packets from the peer. +.PP +L2TPv3 defines two packet encapsulation formats: UDP or IP. UDP +encapsulation is most common. IP encapsulation uses a dedicated IP +protocol value to carry L2TP data without the overhead of UDP.
Use IP +encapsulation only when there are no NAT devices or firewalls in the +network path. +.PP +When an L2TPv3 ethernet session is created, a virtual network +interface is created for the session, which must then be configured +and brought up, just like any other network interface. When data is +passed through the interface, it is carried over the L2TP tunnel to +the peer. By configuring the system's routing tables or adding the +interface to a bridge, the L2TP interface is like a virtual wire +(pseudowire) connected to the peer. +.PP +Establishing an unmanaged L2TPv3 ethernet pseudowire involves manually +creating L2TP contexts on the local system and at the peer. Parameters +used at each site must correspond or no data will be passed. No +consistency checks are possible since there is no control protocol +used to establish unmanaged L2TP tunnels. Once the virtual network +interface of a given L2TP session is configured and enabled, data can +be transmitted, even if the peer isn't yet configured. If the peer +isn't configured, the L2TP data packets will be discarded by +the peer. +.PP +To establish an unmanaged L2TP tunnel, use +.B l2tp add tunnel +and +.B l2tp add session +commands described in this document. Then configure and enable the +tunnel's virtual network interface, as required. +.PP +Note that unmanaged tunnels carry only ethernet frames. If you need to +carry PPP traffic (L2TPv2) or your peer doesn't support unmanaged +L2TPv3 tunnels, you will need an L2TP server which implements the L2TP +control protocol. The L2TP control protocol allows dynamic L2TP +tunnels and sessions to be established and provides for detecting and +acting upon network failures. +.SS ip l2tp add tunnel - add a new tunnel +.TP +.BI tunnel_id " ID" +set the tunnel id, which is a 32-bit integer value. Uniquely +identifies the tunnel. The value used must match the peer_tunnel_id +value being used at the peer. 
+.TP +.BI peer_tunnel_id " ID" +set the peer tunnel id, which is a 32-bit integer value assigned to +the tunnel by the peer. The value used must match the tunnel_id value +being used at the peer. +.TP +.BI remote " ADDR" +set the IP address of the remote peer. May be specified as an IPv4 +address or an IPv6 address. +.TP +.BI local " ADDR" +set the IP address of the local interface to be used for the +tunnel. This address must be the address of a local interface. May be +specified as an IPv4 address or an IPv6 address. +.TP +.BI encap " ENCAP" +set the encapsulation type of the tunnel. +.br +Valid values for encapsulation are: +.BR udp ", " ip "." +.TP +.BI udp_sport " PORT" +set the UDP source port to be used for the tunnel. Must be present +when udp encapsulation is selected. Ignored when ip encapsulation is +selected. +.TP +.BI udp_dport " PORT" +set the UDP destination port to be used for the tunnel. Must be +present when udp encapsulation is selected. Ignored when ip +encapsulation is selected. +.TP +.BI udp_csum " STATE" +(IPv4 only) control if IPv4 UDP checksums should be calculated and checked for the +encapsulating UDP packets, when UDP encapsulation is selected. +Default is +.BR off "." +.br +Valid values are: +.BR on ", " off "." +.TP +.BI udp6_csum_tx " STATE" +(IPv6 only) control if IPv6 UDP checksums should be calculated for encapsulating +UDP packets, when UDP encapsulation is selected. +Default is +.BR on "." +.br +Valid values are: +.BR on ", " off "." +.TP +.BI udp6_csum_rx " STATE" +(IPv6 only) control if IPv6 UDP checksums should be checked for the encapsulating +UDP packets, when UDP encapsulation is selected. +Default is +.BR on "." +.br +Valid values are: +.BR on ", " off "." +.SS ip l2tp del tunnel - destroy a tunnel +.TP +.BI tunnel_id " ID" +set the tunnel id of the tunnel to be deleted. All sessions within the +tunnel must be deleted first.
+.SS ip l2tp show tunnel - show information about tunnels +.TP +.BI tunnel_id " ID" +set the tunnel id of the tunnel to be shown. If not specified, +information about all tunnels is printed. +.SS ip l2tp add session - add a new session to a tunnel +.TP +.BI name " NAME " +sets the session network interface name. Default is l2tpethN. +.TP +.BI tunnel_id " ID" +set the tunnel id, which is a 32-bit integer value. Uniquely +identifies the tunnel into which the session will be created. The +tunnel must already exist. +.TP +.BI session_id " ID" +set the session id, which is a 32-bit integer value. Uniquely +identifies the session being created. The value used must match the +peer_session_id value being used at the peer. +.TP +.BI peer_session_id " ID" +set the peer session id, which is a 32-bit integer value assigned to +the session by the peer. The value used must match the session_id +value being used at the peer. +.TP +.BI cookie " HEXSTR" +sets an optional cookie value to be assigned to the session. This is a +4 or 8 byte value, specified as 8 or 16 hex digits, +e.g. 014d3636deadbeef. The value must match the peer_cookie value set +at the peer. The cookie value is carried in L2TP data packets and is +checked for expected value at the peer. Default is to use no cookie. +.TP +.BI peer_cookie " HEXSTR" +sets an optional peer cookie value to be assigned to the session. This +is a 4 or 8 byte value, specified as 8 or 16 hex digits, +e.g. 014d3636deadbeef. The value must match the cookie value set at +the peer. It tells the local system what cookie value to expect to +find in received L2TP packets. Default is to use no cookie. +.TP +.BI l2spec_type " L2SPECTYPE" +set the layer2specific header type of the session. +.br +Valid values are: +.BR none ", " default "." +.TP +.BI seq " SEQ" +controls sequence numbering to prevent or detect out of order packets. +.B send +puts a sequence number in the default layer2specific header of each +outgoing packet. 
+.B recv +reorder packets if they are received out of order. +Default is +.BR none "." +.br +Valid values are: +.BR none ", " send ", " recv ", " both "." +.SS ip l2tp del session - destroy a session +.TP +.BI tunnel_id " ID" +set the tunnel id in which the session to be deleted is located. +.TP +.BI session_id " ID" +set the session id of the session to be deleted. +.SS ip l2tp show session - show information about sessions +.TP +.BI tunnel_id " ID" +set the tunnel id of the session(s) to be shown. If not specified, +information about sessions in all tunnels is printed. +.TP +.BI session_id " ID" +set the session id of the session to be shown. If not specified, +information about all sessions is printed. +.SH EXAMPLES +.PP +.SS Setup L2TP tunnels and sessions +.nf +site-A:# ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 \\ + encap udp local 1.2.3.4 remote 5.6.7.8 \\ + udp_sport 5000 udp_dport 6000 +site-A:# ip l2tp add session tunnel_id 3000 session_id 1000 \\ + peer_session_id 2000 + +site-B:# ip l2tp add tunnel tunnel_id 4000 peer_tunnel_id 3000 \\ + encap udp local 5.6.7.8 remote 1.2.3.4 \\ + udp_sport 6000 udp_dport 5000 +site-B:# ip l2tp add session tunnel_id 4000 session_id 2000 \\ + peer_session_id 1000 + +site-A:# ip link set l2tpeth0 up mtu 1488 + +site-B:# ip link set l2tpeth0 up mtu 1488 +.fi +.PP +Notice that the IP addresses, UDP ports and tunnel / session ids are +matched and reversed at each site. +.SS Configure as IP interfaces +The two interfaces can be configured with IP addresses if only IP data +is to be carried. This is perhaps the simplest configuration. +.PP +.nf +site-A:# ip addr add 10.42.1.1 peer 10.42.1.2 dev l2tpeth0 + +site-B:# ip addr add 10.42.1.2 peer 10.42.1.1 dev l2tpeth0 + +site-A:# ping 10.42.1.2 +.fi +.PP +Now the link should be usable. Add static routes as needed to have +data sent over the new link. 
+.PP +.SS Configure as bridged interfaces +To carry non-IP data, the L2TP network interface is added to a bridge +instead of being assigned its own IP address, using standard Linux +utilities. Since raw ethernet frames are then carried inside the +tunnel, the MTU of the L2TP interfaces must be set to allow space for +those headers. +.PP +.nf +site-A:# ip link set l2tpeth0 up mtu 1446 +site-A:# ip link add br0 type bridge +site-A:# ip link set l2tpeth0 master br0 +site-A:# ip link set eth0 master br0 +site-A:# ip link set br0 up +.fi +.PP +If you are using VLANs, set up a bridge per VLAN and bridge each VLAN +over a separate L2TP session. For example, to bridge VLAN ID 5 on eth1 +over an L2TP pseudowire: +.PP +.nf +site-A:# ip link set l2tpeth0 up mtu 1446 +site-A:# ip link add brvlan5 type bridge +site-A:# ip link set l2tpeth0.5 master brvlan5 +site-A:# ip link set eth1.5 master brvlan5 +site-A:# ip link set brvlan5 up +.fi +.PP +Adding the L2TP interface to a bridge causes the bridge to forward +traffic over the L2TP pseudowire just like it forwards over any other +interface. The bridge learns MAC addresses of hosts attached to each +interface and intelligently forwards frames from one bridge port to +another. IP addresses are not assigned to the l2tpethN interfaces. If +the bridge is correctly configured at both sides of the L2TP +pseudowire, it should be possible to reach hosts in the peer's bridged +network. +.PP +When raw ethernet frames are bridged across an L2TP tunnel, large +frames may be fragmented and forwarded as individual IP fragments to +the recipient, depending on the MTU of the physical interface used by +the tunnel. When the ethernet frames carry protocols which are +reassembled by the recipient, like IP, this isn't a problem. However, +such fragmentation can cause problems for protocols like PPPoE where +the recipient expects to receive ethernet frames exactly as +transmitted.
In such cases, it is important that frames leaving the +tunnel are reassembled back into a single frame before being +forwarded on. To do so, enable netfilter connection tracking +(conntrack) or manually load the Linux netfilter defrag modules at +each tunnel endpoint. +.PP +.nf +site-A:# modprobe nf_defrag_ipv4 + +site-B:# modprobe nf_defrag_ipv4 +.fi +.PP +If L2TP is being used over IPv6, use the IPv6 defrag module. +.SH INTEROPERABILITY +.PP +Unmanaged (static) L2TPv3 tunnels are supported by some network +equipment vendors such as Cisco. +.PP +In Linux, L2TP Hello messages are not supported in unmanaged +tunnels. Hello messages are used by L2TP clients and servers to detect +link failures in order to automate tearing down and reestablishing +dynamic tunnels. If a non-Linux peer supports Hello messages in +unmanaged tunnels, it must be turned off to interoperate with Linux. +.PP +Linux defaults to use the Default Layer2SpecificHeader type as defined +in the L2TPv3 protocol specification, RFC3931. This setting must be +consistent with that configured at the peer. Some vendor +implementations (e.g. Cisco) default to use a Layer2SpecificHeader +type of None. 
+.SH SEE ALSO +.br +.BR ip (8) +.SH AUTHOR +James Chapman <jchapman@katalix.com> diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in new file mode 100644 index 0000000..31e2d7f --- /dev/null +++ b/man/man8/ip-link.8.in @@ -0,0 +1,3030 @@ +.TH IP\-LINK 8 "13 Dec 2012" "iproute2" "Linux" +.SH "NAME" +ip-link \- network device configuration +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip link +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.BI "ip link add" +.RB "[ " link +.IR DEVICE " ]" +.RB "[ " name " ]" +.I NAME +.br +.RB "[ " txqueuelen +.IR PACKETS " ]" +.br +.RB "[ " address +.IR LLADDR " ]" +.RB "[ " broadcast +.IR LLADDR " ]" +.br +.RB "[ " mtu +.IR MTU " ]" +.RB "[ " index +.IR IDX " ]" +.br +.RB "[ " numtxqueues +.IR QUEUE_COUNT " ]" +.RB "[ " numrxqueues +.IR QUEUE_COUNT " ]" +.br +.RB "[ " gso_max_size +.IR BYTES " ]" +.RB "[ " gso_ipv4_max_size +.IR BYTES " ]" +.RB "[ " gso_max_segs +.IR SEGMENTS " ]" +.br +.RB "[ " gro_max_size +.IR BYTES " ]" +.RB "[ " gro_ipv4_max_size +.IR BYTES " ]" +.br +.RB "[ " netns " {" +.IR PID " | " NETNSNAME " | " NETNSFILE " } ]" +.br +.BI type " TYPE" +.RI "[ " ARGS " ]" + +.ti -8 +.BR "ip link delete " { +.IR DEVICE " | " +.BI "group " GROUP +} +.BI type " TYPE" +.RI "[ " ARGS " ]" + +.ti -8 +.BR "ip link set " { +.IR DEVICE " | " +.BI "group " GROUP +} +.br +.RB "[ { " up " | " down " } ]" +.br +.RB "[ " type +.IR "ETYPE TYPE_ARGS" " ]" +.br +.RB "[ " arp " { " on " | " off " } ]" +.br +.RB "[ " dynamic " { " on " | " off " } ]" +.br +.RB "[ " multicast " { " on " | " off " } ]" +.br +.RB "[ " allmulticast " { " on " | " off " } ]" +.br +.RB "[ " promisc " { " on " | " off " } ]" +.br +.RB "[ " protodown " { " on " | " off " } ]" +.br +.RB "[ " protodown_reason +.IR PREASON " { " on " | " off " } ]" +.br +.RB "[ " trailers " { " on " | " off " } ]" +.br +.RB "[ " txqueuelen +.IR PACKETS " ]" +.br +.RB "[ " gso_max_size +.IR BYTES " ]" +.RB "[ " gso_ipv4_max_size +.IR BYTES " ]" +.RB "[ " gso_max_segs 
+.IR SEGMENTS " ]" +.br +.RB "[ " gro_max_size +.IR BYTES " ]" +.RB "[ " gro_ipv4_max_size +.IR BYTES " ]" +.br +.RB "[ " name +.IR NEWNAME " ]" +.br +.RB "[ " address +.IR LLADDR " ]" +.br +.RB "[ " broadcast +.IR LLADDR " ]" +.br +.RB "[ " mtu +.IR MTU " ]" +.br +.RB "[ " netns " {" +.IR PID " | " NETNSNAME " | " NETNSFILE " } ]" +.br +.RB "[ " link-netnsid +.IR ID " ]" +.br +.RB "[ " alias +.IR NAME " ]" +.br +.RB "[ " vf +.IR NUM " [" +.B mac +.IR LLADDR " ]" +.br +.in +9 +.RI "[ " VFVLAN-LIST " ]" +.br +.RB "[ " rate +.IR TXRATE " ]" +.br +.RB "[ " max_tx_rate +.IR TXRATE " ]" +.br +.RB "[ " min_tx_rate +.IR TXRATE " ]" +.br +.RB "[ " spoofchk " { " on " | " off " } ]" +.br +.RB "[ " query_rss " { " on " | " off " } ]" +.br +.RB "[ " state " { " auto " | " enable " | " disable " } ]" +.br +.RB "[ " trust " { " on " | " off " } ]" +.br +.RB "[ " node_guid " eui64 ]" +.br +.RB "[ " port_guid " eui64 ] ]" +.br +.in -9 +.RB "[ { " xdp " | " xdpgeneric " | " xdpdrv " | " xdpoffload " } { " off " | " +.br +.in +8 +.BR object +.IR FILE +.RB "[ { " section " | " program " } " +.IR NAME " ]" +.RB "[ " verbose " ] |" +.br +.BR pinned +.IR FILE " } ]" +.br +.in -8 +.RB "[ " master +.IR DEVICE " ]" +.br +.RB "[ " nomaster " ]" +.br +.RB "[ " vrf +.IR NAME " ]" +.br +.RB "[ " addrgenmode " { " eui64 " | " none " | " stable_secret " | " random " } ]" +.br +.RB "[ " macaddr +.RI "[ " MACADDR " ]" +.br +.in +10 +.RB "[ { " flush " | " add " | " del " } " +.IR MACADDR " ]" +.br +.RB "[ " set +.IR MACADDR " ] ]" +.br + +.ti -8 +.B ip link show +.RI "[ " DEVICE " | " +.B group +.IR GROUP " ] [" +.BR up " ] [" +.B master +.IR DEVICE " ] [" +.B type +.IR ETYPE " ] [" +.B vrf +.IR NAME " ] [" +.BR nomaster " ]" + +.ti -8 +.B ip link xstats +.BI type " TYPE" +.RI "[ " ARGS " ]" + +.ti -8 +.B ip link afstats +.RB "[ " dev +.IR DEVICE " ]" + +.ti -8 +.B ip link help +.RI "[ " TYPE " ]" + +.ti -8 +.IR TYPE " := [ " +.BR amt " | " +.BR bareudp " |" +.BR bond " | " +.BR bridge " | " +.BR 
can " | " +.BR dsa " | " +.BR dummy " | " +.BR erspan " |" +.BR geneve " |" +.BR gre " |" +.BR gretap " |" +.BR gtp " |" +.BR hsr " | " +.BR ifb " | " +.BR ip6erspan " |" +.BR ip6gre " |" +.BR ip6gretap " |" +.BR ip6tnl " |" +.BR ipip " |" +.BR ipoib " |" +.BR ipvlan " |" +.BR ipvtap " |" +.BR lowpan " |" +.BR macsec " |" +.BR macvlan " | " +.BR macvtap " | " +.BR netdevsim " |" +.BR netkit " |" +.BR nlmon " |" +.BR rmnet " |" +.BR sit " |" +.BR vcan " | " +.BR veth " | " +.BR virt_wifi " |" +.BR vlan " | " +.BR vrf " |" +.BR vti " |" +.BR vxcan " | " +.BR vxlan " |" +.BR xfrm " ]" + +.ti -8 +.IR ETYPE " := [ " TYPE " |" +.BR bridge_slave " | " bond_slave " ]" + +.ti -8 +.IR VFVLAN-LIST " := [ " VFVLAN-LIST " ] " VFVLAN + +.ti -8 +.IR VFVLAN " := " +.RB "[ " vlan +.IR VLANID " [ " +.B qos +.IR VLAN-QOS " ] [" +.B proto +.IR VLAN-PROTO " ] ]" +.in -8 + +.ti -8 +.BI "ip link property add dev " DEVICE +.RB "[ " altname +.IR NAME " .. ]" + +.ti -8 +.BI "ip link property del dev " DEVICE +.RB "[ " altname +.IR NAME " .. ]" + +.SH "DESCRIPTION" +.SS ip link add - add virtual link + +.TP +.BI link " DEVICE " +specifies the physical device to operate on. + +.I NAME +specifies the name of the new virtual device. + +.I TYPE +specifies the type of the new device.
+.sp +Link types: + +.in +8 +.BR amt +- Automatic Multicast Tunneling (AMT) +.sp +.BR bareudp +- Bare UDP L3 encapsulation support +.sp +.B bond +- Bonding device +.sp +.B bridge +- Ethernet Bridge device +.sp +.B can +- Controller Area Network +.sp +.B dsa +- Distributed Switch Architecture +.sp +.B dummy +- Dummy network interface +.sp +.BR erspan +- Encapsulated Remote SPAN over GRE and IPv4 +.sp +.B geneve +- GEneric NEtwork Virtualization Encapsulation +.sp +.B gre +- Virtual tunnel interface GRE over IPv4 +.sp +.BR gretap +- Virtual L2 tunnel interface GRE over IPv4 +.sp +.BR gtp +- GPRS Tunneling Protocol +.sp +.B hsr +- High-availability Seamless Redundancy device +.sp +.B ifb +- Intermediate Functional Block device +.sp +.BR ip6erspan +- Encapsulated Remote SPAN over GRE and IPv6 +.sp +.BR ip6gre +- Virtual tunnel interface GRE over IPv6 +.sp +.BR ip6gretap +- Virtual L2 tunnel interface GRE over IPv6 +.sp +.BR ip6tnl +- Virtual tunnel interface IPv4|IPv6 over IPv6 +.sp +.BR ipip +- Virtual tunnel interface IPv4 over IPv4 +.sp +.B ipoib +- IP over Infiniband device +.sp +.BR ipvlan +- Interface for L3 (IPv6/IPv4) based VLANs +.sp +.BR ipvtap +- Interface for L3 (IPv6/IPv4) based VLANs and TAP +.sp +.BR lowpan +- Interface for 6LoWPAN (IPv6) over IEEE 802.15.4 / Bluetooth +.sp +.BR macsec +- Interface for IEEE 802.1AE MAC Security (MACsec) +.sp +.B macvlan +- Virtual interface based on link layer address (MAC) +.sp +.B macvtap +- Virtual interface based on link layer address (MAC) and TAP.
+.sp +.BR netdevsim +- Interface for netdev API tests +.sp +.BR netkit +- BPF-programmable network device +.sp +.BR nlmon +- Netlink monitoring device +.sp +.BR rmnet +- Qualcomm rmnet device +.sp +.BR sit +- Virtual tunnel interface IPv6 over IPv4 +.sp +.B vcan +- Virtual Controller Area Network interface +.sp +.B veth +- Virtual ethernet interface +.sp +.BR virt_wifi +- rtnetlink wifi simulation device +.sp +.BR vlan +- 802.1q tagged virtual LAN interface +.sp +.BR vrf +- Interface for L3 VRF domains +.sp +.BR vti +- Virtual tunnel interface +.sp +.B vxcan +- Virtual Controller Area Network tunnel interface +.sp +.BR vxlan +- Virtual eXtended LAN +.sp +.BR xfrm +- Virtual xfrm interface +.sp +.in -8 + +.TP +.BI numtxqueues " QUEUE_COUNT " +specifies the number of transmit queues for the new device. + +.TP +.BI numrxqueues " QUEUE_COUNT " +specifies the number of receive queues for the new device. + +.TP +.BI gso_max_size " BYTES " +specifies the recommended maximum size of a Generic Segment Offload +packet the new device should accept. This is also used to enable BIG +TCP for IPv6 on this device when the size is greater than 65536. + +.TP +.BI gso_ipv4_max_size " BYTES " +specifies the recommended maximum size of an IPv4 Generic Segment Offload +packet the new device should accept. This is especially used to enable +BIG TCP for IPv4 on this device by setting to a size greater than 65536. +Note that +.B gso_max_size +needs to be set to a size greater than or equal to +.B gso_ipv4_max_size +to really enable BIG TCP for IPv4. + +.TP +.BI gso_max_segs " SEGMENTS " +specifies the recommended maximum number of Generic Segment Offload +segments the new device should accept. + +.TP +.BI gro_max_size " BYTES " +specifies the maximum size of a packet built by GRO stack on this +device. This is also used for BIG TCP to allow the size of a +merged IPv6 GSO packet on this device greater than 65536.
+ +.TP +.BI gro_ipv4_max_size " BYTES " +specifies the maximum size of an IPv4 packet built by GRO stack on this +device. This is especially used for BIG TCP to allow the size of a +merged IPv4 GSO packet on this device greater than 65536. + +.TP +.BI index " IDX " +specifies the desired index of the new virtual device. The link +creation fails, if the index is busy. + +.TP +.B netns +.RI "{ " PID " | " NETNSNAME " | " NETNSFILE " }" +.br +create the device in the network namespace associated with process +.IR "PID " or +the name +.IR "NETNSNAME " or +the file +.IR "NETNSFILE". + +.TP +VLAN Type Support +For a link of type +.I VLAN +the following additional arguments are supported: + +.BI "ip link add +.BI link " DEVICE " +.BI name " NAME " +.B "type vlan" +[ +.BI protocol " VLAN_PROTO " +] +.BI id " VLANID " +[ +.BR reorder_hdr " { " on " | " off " } " +] +[ +.BR gvrp " { " on " | " off " } " +] +[ +.BR mvrp " { " on " | " off " } " +] +[ +.BR loose_binding " { " on " | " off " } " +] +[ +.BR bridge_binding " { " on " | " off " } " +] +[ +.BI ingress-qos-map " QOS-MAP " +] +[ +.BI egress-qos-map " QOS-MAP " +] + +.in +8 +.sp +.BI protocol " VLAN_PROTO " +- either 802.1Q or 802.1ad. + +.BI id " VLANID " +- specifies the VLAN Identifier to use. Note that numbers with a leading " 0 " or " 0x " are interpreted as octal or hexadecimal, respectively. + +.BR reorder_hdr " { " on " | " off " } " +- specifies whether ethernet headers are reordered or not (default is +.BR on ")." + +.in +4 +If +.BR reorder_hdr " is " on +then VLAN header will not be inserted immediately but only before +passing to the physical device (if this device does not support VLAN +offloading), the similar on the RX direction - by default the packet +will be untagged before being received by VLAN device.
Reordering +allows one to accelerate tagging on egress and to hide VLAN header on +ingress so the packet looks like regular Ethernet packet, at the same +time it might be confusing for packet capture as the VLAN header does +not exist within the packet. + +VLAN offloading can be checked by +.BR ethtool "(8):" +.in +4 +.sp +.B ethtool -k +<phy_dev> | +.RB grep " tx-vlan-offload" +.sp +.in -4 +where <phy_dev> is the physical device to which VLAN device is bound. +.in -4 + +.BR gvrp " { " on " | " off " } " +- specifies whether this VLAN should be registered using GARP VLAN +Registration Protocol. + +.BR mvrp " { " on " | " off " } " +- specifies whether this VLAN should be registered using Multiple VLAN +Registration Protocol. + +.BR loose_binding " { " on " | " off " } " +- specifies whether the VLAN device state is bound to the physical device state. + +.BR bridge_binding " { " on " | " off " } " +- specifies whether the VLAN device link state tracks the state of bridge ports +that are members of the VLAN. + +.BI ingress-qos-map " QOS-MAP " +- defines a mapping of VLAN header prio field to the Linux internal packet +priority on incoming frames. The format is FROM:TO with multiple mappings +separated by spaces. + +.BI egress-qos-map " QOS-MAP " +- defines a mapping of Linux internal packet priority to VLAN header prio field +but for outgoing frames. The format is the same as for ingress-qos-map. +.in +4 + +Linux packet priority can be set by +.BR iptables "(8)": +.in +4 +.sp +.B iptables +-t mangle -A POSTROUTING [...] 
-j CLASSIFY --set-class 0:4 +.sp +.in -4 +and this "4" priority can be used in the egress qos mapping to set +VLAN prio "5": +.sp +.in +4 +.B ip +link set veth0.10 type vlan egress 4:5 +.in -4 +.in -4 +.in -8 + +.TP +VXLAN Type Support +For a link of type +.I VXLAN +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BI type " vxlan " id " VNI" +[ +.BI dev " PHYS_DEV " +.RB " ] [ { " group " | " remote " } " +.I IPADDR +] [ +.B local +.RI "{ "IPADDR " | "any " } " +] [ +.BI ttl " TTL " +] [ +.BI tos " TOS " +] [ +.BI df " DF " +] [ +.BI flowlabel " FLOWLABEL " +] [ +.BI dstport " PORT " +] [ +.BI srcport " MIN MAX " +] [ +.RB [ no ] learning +] [ +.RB [ no ] proxy +] [ +.RB [ no ] rsc +] [ +.RB [ no ] l2miss +] [ +.RB [ no ] l3miss +] [ +.RB [ no ] udpcsum +] [ +.RB [ no ] udp6zerocsumtx +] [ +.RB [ no ] udp6zerocsumrx +] [ +.RB [ no ] localbypass +] [ +.BI ageing " SECONDS " +] [ +.BI maxaddress " NUMBER " +] [ +.RB [ no ] external +] [ +.B gbp +] [ +.B gpe +] [ +.RB [ no ] vnifilter +] + +.in +8 +.sp +.BI id " VNI " +- specifies the VXLAN Network Identifier (or VXLAN Segment +Identifier) to use. + +.BI dev " PHYS_DEV" +- specifies the physical device to use for tunnel endpoint communication. + +.sp +.BI group " IPADDR" +- specifies the multicast IP address to join. +This parameter cannot be specified with the +.B remote +parameter. + +.sp +.BI remote " IPADDR" +- specifies the unicast destination IP address to use in outgoing packets +when the destination link layer address is not known in the VXLAN device +forwarding database. This parameter cannot be specified with the +.B group +parameter. + +.sp +.BI local " IPADDR" +- specifies the source IP address to use in outgoing packets. + +.sp +.BI ttl " TTL" +- specifies the TTL value to use in outgoing packets. + +.sp +.BI tos " TOS" +- specifies the TOS value to use in outgoing packets. 
+ +.sp +.BI df " DF" +- specifies the usage of the Don't Fragment flag (DF) bit in outgoing packets +with IPv4 headers. The value +.B inherit +causes the bit to be copied from the original IP header. The values +.B unset +and +.B set +cause the bit to be always unset or always set, respectively. By default, the +bit is not set. + +.sp +.BI flowlabel " FLOWLABEL" +- specifies the flow label to use in outgoing packets. + +.sp +.BI dstport " PORT" +- specifies the UDP destination port to communicate to the remote +VXLAN tunnel endpoint. + +.sp +.BI srcport " MIN MAX" +- specifies the range of port numbers to use as UDP +source ports to communicate to the remote VXLAN tunnel endpoint. + +.sp +.RB [ no ] learning +- specifies if unknown source link layer addresses and IP addresses +are entered into the VXLAN device forwarding database. + +.sp +.RB [ no ] rsc +- specifies if route short circuit is turned on. + +.sp +.RB [ no ] proxy +- specifies if ARP proxy is turned on. + +.sp +.RB [ no ] l2miss +- specifies if netlink LLADDR miss notifications are generated. + +.sp +.RB [ no ] l3miss +- specifies if netlink IP ADDR miss notifications are generated. + +.sp +.RB [ no ] udpcsum +- specifies if UDP checksum is calculated for transmitted packets over IPv4. + +.sp +.RB [ no ] udp6zerocsumtx +- skip UDP checksum calculation for transmitted packets over IPv6. + +.sp +.RB [ no ] udp6zerocsumrx +- allow incoming UDP packets over IPv6 with zero checksum field. + +.sp +.RB [ no ] localbypass +- if FDB destination is local, with nolocalbypass set, forward encapsulated +packets to the userspace network stack. If there is a userspace process +listening for these packets, it will have a chance to process them. If +localbypass is active (default), bypass the kernel network stack and +inject the packets into the target VXLAN device, assuming one exists. + +.sp +.BI ageing " SECONDS" +- specifies the lifetime in seconds of FDB entries learnt by the kernel. 
+ +.sp +.BI maxaddress " NUMBER" +- specifies the maximum number of FDB entries. + +.sp +.RB [ no ] external +- specifies whether an external control plane +.RB "(e.g. " "ip route encap" ) +or the internal FDB should be used. + +.sp +.RB [ no ] vnifilter +- specifies whether the vxlan device is capable of vni filtering. Only works with a vxlan +device with external flag set. Once enabled, bridge vni command is used to manage the +vni filtering table on the device. The device can only receive packets with VNIs configured +in the vni filtering table. + +.sp +.B gbp +- enables the Group Policy extension (VXLAN-GBP). + +.in +4 +Allows one to transport group policy context across VXLAN network peers. +If enabled, includes the mark of a packet in the VXLAN header for outgoing +packets and fills the packet mark based on the information found in the +VXLAN header for incoming packets. + +Format of upper 16 bits of packet mark (flags): + +.in +2 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +.br +|-|-|-|-|-|-|-|-|-|D|-|-|A|-|-|-| +.br ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +.B D := +Don't Learn bit. When set, this bit indicates that the egress +VTEP MUST NOT learn the source address of the encapsulated frame. + +.B A := +Indicates that the group policy has already been applied to +this packet. Policies MUST NOT be applied by devices when the A bit is set. +.in -2 + +Format of lower 16 bits of packet mark (policy ID): + +.in +2 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +.br +| Group Policy ID | +.br ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +.in -2 + +Example: + iptables -A OUTPUT [...] -j MARK --set-mark 0x800FF + +.in -4 + +.sp +.B gpe +- enables the Generic Protocol extension (VXLAN-GPE). Currently, this is +only supported together with the +.B external +keyword. 
+ +.in -8 + +.TP +VETH, VXCAN Type Support +For a link of type +.I VETH/VXCAN +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BR type " { " veth " | " vxcan " }" +[ +.BR peer +.BI "name " NAME +] + +.in +8 +.sp +.BR peer +.BI "name " NAME +- specifies the virtual pair device name of the +.I VETH/VXCAN +tunnel. + +.in -8 + +.TP +netkit Type Support +For a link of type +.I netkit +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BR type " netkit " +[ +.BI mode " MODE " +] [ +.I "POLICY " +] [ +.BR peer +[ +.I "POLICY " +] [ +.I "NAME " +] ] + +.in +8 + +.sp +.BI mode " MODE" +- specifies the operation mode of the netkit device with "l3" and "l2" +as possible values. Default option is "l3". + +.sp +.I "POLICY" +- specifies the default device policy when no BPF programs are attached +with "forward" and "blackhole" as possible values. Default option is +"forward". Specifying policy before the peer option refers to the primary +device, after the peer option refers to the peer device. + +.sp +.I "NAME" +- specifies the device name of the peer device. + +.in -8 + +.TP +IPIP, SIT Type Support +For a link of type +.IR IPIP " or " SIT +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BR type " { " ipip " | " sit " }" +.BI " remote " ADDR " local " ADDR +[ +.BR encap " { " fou " | " gue " | " none " }" +] [ +.BR encap-sport " { " \fIPORT " | " auto " }" +] [ +.BI "encap-dport " PORT +] [ +.RB [ no ] encap-csum +] [ +.RB [ no ] encap-remcsum +] [ +.BR mode " { " ip6ip " | " ipip " | " mplsip " | " any " }" +] [ +.BR external +] + +.in +8 +.sp +.BI remote " ADDR " +- specifies the remote address of the tunnel. + +.sp +.BI local " ADDR " +- specifies the fixed local address for tunneled packets. +It must be an address on another interface on this host. + +.sp +.BR encap " { " fou " | " gue " | " none " }" +- specifies type of secondary UDP encapsulation. 
"fou" indicates +Foo-Over-UDP, "gue" indicates Generic UDP Encapsulation. + +.sp +.BR encap-sport " { " \fIPORT " | " auto " }" +- specifies the source port in UDP encapsulation. +.IR PORT +indicates the port by number, "auto" +indicates that the port number should be chosen automatically +(the kernel picks a flow based on the flow hash of the +encapsulated packet). + +.sp +.RB [ no ] encap-csum +- specifies if UDP checksums are enabled in the secondary +encapsulation. + +.sp +.RB [ no ] encap-remcsum +- specifies if Remote Checksum Offload is enabled. This is only +applicable for Generic UDP Encapsulation. + +.sp +.BI mode " { ip6ip | ipip | mplsip | any } " +- specifies mode in which device should run. "ip6ip" indicates +IPv6-Over-IPv4, "ipip" indicates "IPv4-Over-IPv4", "mplsip" indicates +MPLS-Over-IPv4, "any" indicates IPv6, IPv4 or MPLS Over IPv4. Supported for +SIT where the default is "ip6ip" and IPIP where the default is "ipip". +IPv6-Over-IPv4 is not supported for IPIP. + +.sp +.BR external +- make this tunnel externally controlled +.RB "(e.g. " "ip route encap" ). + +.in -8 +.TP +GRE Type Support +For a link of type +.IR GRE " or " GRETAP +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BR type " { " gre " | " gretap " }" +.BI " remote " ADDR " local " ADDR +[ +.RB [ no ] "" [ i | o ] seq +] [ +.RB [ i | o ] key +.I KEY +| +.BR no [ i | o ] key +] [ +.RB [ no ] "" [ i | o ] csum +] [ +.BI ttl " TTL " +] [ +.BI tos " TOS " +] [ +.RB [ no ] pmtudisc +] [ +.RB [ no ] ignore-df +] [ +.BI dev " PHYS_DEV " +] [ +.BR encap " { " fou " | " gue " | " none " }" +] [ +.BR encap-sport " { " \fIPORT " | " auto " }" +] [ +.BI "encap-dport " PORT +] [ +.RB [ no ] encap-csum +] [ +.RB [ no ] encap-remcsum +] [ +.BR external +] + +.in +8 +.sp +.BI remote " ADDR " +- specifies the remote address of the tunnel. + +.sp +.BI local " ADDR " +- specifies the fixed local address for tunneled packets. 
+It must be an address on another interface on this host. + +.sp +.RB [ no ] "" [ i | o ] seq +- serialize packets. +The +.B oseq +flag enables sequencing of outgoing packets. +The +.B iseq +flag requires that all input packets are serialized. + +.sp +.RB [ i | o ] key +.I KEY +| +.BR no [ i | o ] key +- use keyed GRE with key +.IR KEY ". "KEY +is either a number or an IPv4 address-like dotted quad. +The +.B key +parameter specifies the same key to use in both directions. +The +.BR ikey " and " okey +parameters specify different keys for input and output. + +.sp +.RB [ no ] "" [ i | o ] csum +- generate/require checksums for tunneled packets. +The +.B ocsum +flag calculates checksums for outgoing packets. +The +.B icsum +flag requires that all input packets have the correct +checksum. The +.B csum +flag is equivalent to the combination +.B "icsum ocsum" . + +.sp +.BI ttl " TTL" +- specifies the TTL value to use in outgoing packets. + +.sp +.BI tos " TOS" +- specifies the TOS value to use in outgoing packets. + +.sp +.RB [ no ] pmtudisc +- enables/disables Path MTU Discovery on this tunnel. +It is enabled by default. Note that a fixed ttl is incompatible +with this option: tunneling with a fixed ttl always makes pmtu +discovery. + +.sp +.RB [ no ] ignore-df +- enables/disables IPv4 DF suppression on this tunnel. +Normally datagrams that exceed the MTU will be fragmented; the presence +of the DF flag inhibits this, resulting instead in an ICMP Unreachable +(Fragmentation Required) message. Enabling this attribute causes the +DF flag to be ignored. + +.sp +.BI dev " PHYS_DEV" +- specifies the physical device to use for tunnel endpoint communication. + +.sp +.BR encap " { " fou " | " gue " | " none " }" +- specifies type of secondary UDP encapsulation. "fou" indicates +Foo-Over-UDP, "gue" indicates Generic UDP Encapsulation. + +.sp +.BR encap-sport " { " \fIPORT " | " auto " }" +- specifies the source port in UDP encapsulation. 
+.IR PORT +indicates the port by number, "auto" +indicates that the port number should be chosen automatically +(the kernel picks a flow based on the flow hash of the +encapsulated packet). + +.sp +.RB [ no ] encap-csum +- specifies if UDP checksums are enabled in the secondary +encapsulation. + +.sp +.RB [ no ] encap-remcsum +- specifies if Remote Checksum Offload is enabled. This is only +applicable for Generic UDP Encapsulation. + +.sp +.BR external +- make this tunnel externally controlled +.RB "(e.g. " "ip route encap" ). + +.in -8 + +.TP +IP6GRE/IP6GRETAP Type Support +For a link of type +.I IP6GRE/IP6GRETAP +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BR type " { " ip6gre " | " ip6gretap " }" +.BI remote " ADDR " local " ADDR" +[ +.RB [ no ] "" [ i | o ] seq +] [ +.RB [ i | o ] key +.I KEY +| +.BR no [ i | o ] key +] [ +.RB [ no ] "" [ i | o ] csum +] [ +.BI hoplimit " TTL " +] [ +.BI encaplimit " ELIM " +] [ +.BI tclass " TCLASS " +] [ +.BI flowlabel " FLOWLABEL " +] [ +.BI "dscp inherit" +] [ +.BI "[no]allow-localremote" +] [ +.BI dev " PHYS_DEV " +] [ +.RB external +] + +.in +8 +.sp +.BI remote " ADDR " +- specifies the remote IPv6 address of the tunnel. + +.sp +.BI local " ADDR " +- specifies the fixed local IPv6 address for tunneled packets. +It must be an address on another interface on this host. + +.sp +.RB [ no ] "" [ i | o ] seq +- serialize packets. +The +.B oseq +flag enables sequencing of outgoing packets. +The +.B iseq +flag requires that all input packets are serialized. + +.sp +.RB [ i | o ] key +.I KEY +| +.BR no [ i | o ] key +- use keyed GRE with key +.IR KEY ". "KEY +is either a number or an IPv4 address-like dotted quad. +The +.B key +parameter specifies the same key to use in both directions. +The +.BR ikey " and " okey +parameters specify different keys for input and output. + +.sp +.RB [ no ] "" [ i | o ] csum +- generate/require checksums for tunneled packets. 
+The +.B ocsum +flag calculates checksums for outgoing packets. +The +.B icsum +flag requires that all input packets have the correct +checksum. The +.B csum +flag is equivalent to the combination +.BR "icsum ocsum" . + +.sp +.BI hoplimit " TTL" +- specifies Hop Limit value to use in outgoing packets. + +.sp +.BI encaplimit " ELIM" +- specifies a fixed encapsulation limit. Default is 4. + +.sp +.BI flowlabel " FLOWLABEL" +- specifies a fixed flowlabel. + +.sp +.BI [no]allow-localremote +- specifies whether to allow remote endpoint to have an address configured on +local host. + +.sp +.BI tclass " TCLASS" +- specifies the traffic class field on +tunneled packets, which can be specified as either a two-digit +hex value (e.g. c0) or a predefined string (e.g. internet). +The value +.B inherit +causes the field to be copied from the original IP header. The +values +.BI "inherit/" STRING +or +.BI "inherit/" 00 ".." ff +will set the field to +.I STRING +or +.IR 00 ".." ff +when tunneling non-IP packets. The default value is 00. + +.sp +.RB external +- make this tunnel externally controlled (or not, which is the default). +In the kernel, this is referred to as collect metadata mode. This flag is +mutually exclusive with the +.BR remote , +.BR local , +.BR seq , +.BR key, +.BR csum, +.BR hoplimit, +.BR encaplimit, +.BR flowlabel " and " tclass +options. + +.in -8 + +.TP +IPoIB Type Support +For a link of type +.I IPoIB +the following additional arguments are supported: + +.BI "ip link add " DEVICE " name " NAME +.BR "type ipoib " [ " pkey \fIPKEY" " ] [ " mode " \fIMODE \fR]" + +.in +8 +.sp +.BI pkey " PKEY " +- specifies the IB P-Key to use. + +.BI mode " MODE " +- specifies the mode (datagram or connected) to use. 
+ +.TP +ERSPAN Type Support +For a link of type +.I ERSPAN/IP6ERSPAN +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BR type " { " erspan " | " ip6erspan " }" +.BI remote " ADDR " local " ADDR " seq +.RB key +.I KEY +.BR erspan_ver " \fIversion " +[ +.BR erspan " \fIIDX " +] [ +.BR erspan_dir " { " \fIingress " | " \fIegress " }" +] [ +.BR erspan_hwid " \fIhwid " +] [ +.BI "[no]allow-localremote" +] [ +.RB external +] + +.in +8 +.sp +.BI remote " ADDR " +- specifies the remote address of the tunnel. + +.sp +.BI local " ADDR " +- specifies the fixed local address for tunneled packets. +It must be an address on another interface on this host. + +.sp +.BR erspan_ver " \fIversion " +- specifies the ERSPAN version number. +.IR version +indicates the ERSPAN version to be created: 0 for version 0 (type I), +1 for version 1 (type II) or 2 for version 2 (type III). + +.sp +.BR erspan " \fIIDX " +- specifies the ERSPAN v1 index field. +.IR IDX +indicates a 20-bit index/port number associated with the ERSPAN +traffic's source port and direction. + +.sp +.BR erspan_dir " { " \fIingress " | " \fIegress " }" +- specifies the ERSPAN v2 mirrored traffic's direction. + +.sp +.BR erspan_hwid " \fIhwid " +- a unique identifier of an ERSPAN v2 engine within a system. +.IR hwid +is a 6-bit value for users to configure. + +.sp +.BI [no]allow-localremote +- specifies whether to allow remote endpoint to have an address configured on +local host. + +.sp +.BR external +- make this tunnel externally controlled (or not, which is the default). +In the kernel, this is referred to as collect metadata mode. This flag is +mutually exclusive with the +.BR remote , +.BR local , +.BR erspan_ver , +.BR erspan , +.BR erspan_dir " and " erspan_hwid +options. 
+ +.in -8 + +.TP +GENEVE Type Support +For a link of type +.I GENEVE +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BI type " geneve " id " VNI " remote " IPADDR" +[ +.BI ttl " TTL " +] [ +.BI tos " TOS " +] [ +.BI df " DF " +] [ +.BI flowlabel " FLOWLABEL " +] [ +.BI dstport " PORT" +] [ +.RB [ no ] external +] [ +.RB [ no ] udpcsum +] [ +.RB [ no ] udp6zerocsumtx +] [ +.RB [ no ] udp6zerocsumrx +] [ +.B innerprotoinherit +] + +.in +8 +.sp +.BI id " VNI " +- specifies the Virtual Network Identifier to use. + +.sp +.BI remote " IPADDR" +- specifies the unicast destination IP address to use in outgoing packets. + +.sp +.BI ttl " TTL" +- specifies the TTL value to use in outgoing packets. "0" or "auto" means +use whatever default value, "inherit" means inherit the inner protocol's +ttl. Default option is "0". + +.sp +.BI tos " TOS" +- specifies the TOS value to use in outgoing packets. + +.sp +.BI df " DF" +- specifies the usage of the Don't Fragment flag (DF) bit in outgoing packets +with IPv4 headers. The value +.B inherit +causes the bit to be copied from the original IP header. The values +.B unset +and +.B set +cause the bit to be always unset or always set, respectively. By default, the +bit is not set. + +.sp +.BI flowlabel " FLOWLABEL" +- specifies the flow label to use in outgoing packets. + +.sp +.BI dstport " PORT" +- select a destination port other than the default of 6081. + +.sp +.RB [ no ] external +- make this tunnel externally controlled (or not, which is the default). This +flag is mutually exclusive with the +.BR id , +.BR remote , +.BR ttl , +.BR tos " and " flowlabel +options. + +.sp +.RB [ no ] udpcsum +- specifies if UDP checksum is calculated for transmitted packets over IPv4. + +.sp +.RB [ no ] udp6zerocsumtx +- skip UDP checksum calculation for transmitted packets over IPv6. + +.sp +.RB [ no ] udp6zerocsumrx +- allow incoming UDP packets over IPv6 with zero checksum field. 
+ +.sp +.B innerprotoinherit +- use IPv4/IPv6 as inner protocol instead of Ethernet. + +.in -8 + +.TP +Bareudp Type Support +For a link of type +.I Bareudp +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BI type " bareudp " dstport " PORT " ethertype " PROTO" +[ +.BI srcportmin " PORT " +] [ +.RB [ no ] multiproto +] + +.in +8 +.sp +.BI dstport " PORT" +- specifies the destination port for the UDP tunnel. + +.sp +.BI ethertype " PROTO" +- specifies the ethertype of the L3 protocol being tunnelled. +.B ethertype +can be given as plain Ethernet protocol number or using the protocol name +("ipv4", "ipv6", "mpls_uc", etc.). + +.sp +.BI srcportmin " PORT" +- selects the lowest value of the UDP tunnel source port range. + +.sp +.RB [ no ] multiproto +- activates support for protocols similar to the one +.RB "specified by " ethertype . +When +.B ethertype +is "mpls_uc" (that is, unicast MPLS), this allows the tunnel to also handle +multicast MPLS. +When +.B ethertype +is "ipv4", this allows the tunnel to also handle IPv6. This option is disabled +by default. + +.TP +AMT Type Support +For a link of type +.I AMT +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BI type " AMT " discovery " IPADDR " mode " { " gateway " | " relay " } " +.BI local " IPADDR " dev " PHYS_DEV " [ +.BI relay_port " PORT " ] +[ +.BI gateway_port " PORT " ] +[ +.BI max_tunnels " NUMBER " +] + +.in +8 +.sp +.BI discovery " IPADDR" +- specifies the unicast discovery IP address to use to find remote IP address. + +.BR mode " { " gateway " | " relay " } " +- specifies the role of AMT, Gateway or Relay + +.BI local " IPADDR " +- specifies the source IP address to use in outgoing packets. + +.BI dev " PHYS_DEV " +- specifies the underlying physical interface from which transform traffic +is sent and received. + +.BI relay_port " PORT " +- specifies the UDP Relay port to communicate to the Relay. 
+ +.BI gateway_port " PORT " +- specifies the UDP Gateway port to communicate to the Gateway. + +.BI max_tunnels " NUMBER " +- specifies the maximum number of tunnels. + +.in -8 + +.TP +MACVLAN and MACVTAP Type Support +For a link of type +.I MACVLAN +or +.I MACVTAP +the following additional arguments are supported: + +.BI "ip link add link " DEVICE " name " NAME +.BR type " { " macvlan " | " macvtap " } " +.BR mode " { " private " | " vepa " | " bridge " | " passthru +.RB " [ " nopromisc " ] | " source " [ " nodst " ] } " +.RB " [ " bcqueuelen " { " LENGTH " } ] " +.RB " [ " bclim " " LIMIT " ] " + +.in +8 +.sp +.BR type " { " macvlan " | " macvtap " } " +- specifies the link type to use. +.BR macvlan " creates just a virtual interface, while " +.BR macvtap " in addition creates a character device " +.BR /dev/tapX " to be used just like a " tuntap " device." + +.B mode private +- Do not allow communication between +.B macvlan +instances on the same physical interface, even if the external switch supports +hairpin mode. + +.B mode vepa +- Virtual Ethernet Port Aggregator mode. Data from one +.B macvlan +instance to the other on the same physical interface is transmitted over the +physical interface. Either the attached switch needs to support hairpin mode, +or there must be a TCP/IP router forwarding the packets in order to allow +communication. This is the default mode. + +.B mode bridge +- In bridge mode, all endpoints are directly connected to each other, +communication is not redirected through the physical interface's peer. + +.BR mode " " passthru " [ " nopromisc " ] " +- This mode gives more power to a single endpoint, usually in +.BR macvtap " mode. It is not allowed for more than one endpoint on the same " +physical interface. All traffic will be forwarded to this endpoint, allowing +virtio guests to change MAC address or set promiscuous mode in order to bridge +the interface or create vlan interfaces on top of it. 
By default, this mode +forces the underlying interface into promiscuous mode. Passing the +.BR nopromisc " flag prevents this, so the promisc flag may be controlled " +using standard tools. + +.BR mode " " source " [ " nodst " ] " +- allows one to set a list of allowed mac addresses, which is used to match +against source mac address from received frames on underlying interface. This +allows creating mac based VLAN associations, instead of standard port or tag +based. The feature is useful to deploy 802.1x mac based behavior, +where drivers of underlying interfaces don't allow that. By default, packets +are also considered (duplicated) for destination-based MACVLAN. Passing the +.BR nodst " flag stops matching packets from also going through the " +destination-based flow. + +.BR bcqueuelen " { " LENGTH " } " +- Set the length of the RX queue used to process broadcast and multicast packets. +.BR LENGTH " must be a non-negative integer in the range [0-4294967295]." +Setting a length of 0 will effectively drop all broadcast/multicast traffic. +If not specified the macvlan driver default (1000) is used. +Note that all macvlans that share the same underlying device are using the same +.RB "queue. The parameter here is a " request ", the actual queue length used" +will be the maximum length that any macvlan interface has requested. +When listing device parameters both the bcqueuelen parameter +as well as the actual used bcqueuelen are listed to better help +the user understand the setting. + +.BR bclim " " LIMIT +- Set the threshold for broadcast queueing. +.BR LIMIT " must be a 32-bit integer." +Setting this to -1 disables broadcast queueing altogether. Otherwise +a multicast address will be queued as broadcast if the number of devices +using it is greater than the given value. 
+.in -8 + +.TP +High-availability Seamless Redundancy (HSR) Support +For a link of type +.I HSR +the following additional arguments are supported: + +.BI "ip link add link " DEVICE " name " NAME " type hsr" +.BI slave1 " SLAVE1-IF " slave2 " SLAVE2-IF " +.RB [ " supervision" +.IR ADDR-BYTE " ] [" +.BR version " { " 0 " | " 1 " } ] [" +.BR proto " { " 0 " | " 1 " } ]" + +.in +8 +.sp +.BR type " hsr " +- specifies the link type to use, here HSR. + +.BI slave1 " SLAVE1-IF " +- Specifies the physical device used for the first of the two ring ports. + +.BI slave2 " SLAVE2-IF " +- Specifies the physical device used for the second of the two ring ports. + +.BI supervision " ADDR-BYTE" +- The last byte of the multicast address used for HSR supervision frames. +Default option is "0", possible values 0-255. + +.BR version " { " 0 " | " 1 " }" +- Selects the protocol version of the interface. Default option is "0", which +corresponds to the 2010 version of the HSR standard. Option "1" activates the +2012 version. + +.BR proto " { " 0 " | " 1 " }" +- Selects the protocol at the interface. Default option is "0", which +corresponds to the HSR standard. Option "1" activates the Parallel +Redundancy Protocol (PRP). +. 
+.in -8 + +.TP +BRIDGE Type Support +For a link of type +.I BRIDGE +the following additional arguments are supported: + +.BI "ip link add " DEVICE " type bridge " +[ +.BI ageing_time " AGEING_TIME " +] [ +.BI group_fwd_mask " MASK " +] [ +.BI group_address " ADDRESS " +] [ +.BI forward_delay " FORWARD_DELAY " +] [ +.BI hello_time " HELLO_TIME " +] [ +.BI max_age " MAX_AGE " +] [ +.BI stp_state " STP_STATE " +] [ +.BI priority " PRIORITY " +] [ +.BI no_linklocal_learn " NO_LINKLOCAL_LEARN " +] [ +.BI fdb_max_learned " FDB_MAX_LEARNED " +] [ +.BI vlan_filtering " VLAN_FILTERING " +] [ +.BI vlan_protocol " VLAN_PROTOCOL " +] [ +.BI vlan_default_pvid " VLAN_DEFAULT_PVID " +] [ +.BI vlan_stats_enabled " VLAN_STATS_ENABLED " +] [ +.BI vlan_stats_per_port " VLAN_STATS_PER_PORT " +] [ +.BI mcast_snooping " MULTICAST_SNOOPING " +] [ +.BI mcast_vlan_snooping " MULTICAST_VLAN_SNOOPING " +] [ +.BI mcast_router " MULTICAST_ROUTER " +] [ +.BI mcast_query_use_ifaddr " MCAST_QUERY_USE_IFADDR " +] [ +.BI mcast_querier " MULTICAST_QUERIER " +] [ +.BI mcast_hash_elasticity " HASH_ELASTICITY " +] [ +.BI mcast_hash_max " HASH_MAX " +] [ +.BI mcast_last_member_count " LAST_MEMBER_COUNT " +] [ +.BI mcast_startup_query_count " STARTUP_QUERY_COUNT " +] [ +.BI mcast_last_member_interval " LAST_MEMBER_INTERVAL " +] [ +.BI mcast_membership_interval " MEMBERSHIP_INTERVAL " +] [ +.BI mcast_querier_interval " QUERIER_INTERVAL " +] [ +.BI mcast_query_interval " QUERY_INTERVAL " +] [ +.BI mcast_query_response_interval " QUERY_RESPONSE_INTERVAL " +] [ +.BI mcast_startup_query_interval " STARTUP_QUERY_INTERVAL " +] [ +.BI mcast_stats_enabled " MCAST_STATS_ENABLED " +] [ +.BI mcast_igmp_version " IGMP_VERSION " +] [ +.BI mcast_mld_version " MLD_VERSION " +] [ +.BI nf_call_iptables " NF_CALL_IPTABLES " +] [ +.BI nf_call_ip6tables " NF_CALL_IP6TABLES " +] [ +.BI nf_call_arptables " NF_CALL_ARPTABLES " +] + +.in +8 +.sp +.BI ageing_time " AGEING_TIME " +- configure the bridge's FDB entries ageing time, 
ie the number of +seconds a MAC address will be kept in the FDB after a packet has been +received from that address. After this time has passed, entries are +cleaned up. + +.BI group_fwd_mask " MASK " +- set the group forward mask. This is the bitmask that is applied to +decide whether to forward incoming frames destined to link-local +addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to 0, +ie the bridge does not forward any link-local frames). + +.BI group_address " ADDRESS " +- set the MAC address of the multicast group this bridge uses for STP. +The address must be a link-local address in standard Ethernet MAC +address format, ie an address of the form 01:80:C2:00:00:0X, with X +in [0, 4..f]. + +.BI forward_delay " FORWARD_DELAY " +- set the forwarding delay in seconds, ie the time spent in LISTENING +state (before moving to LEARNING) and in LEARNING state (before +moving to FORWARDING). Only relevant if STP is enabled. Valid values +are between 2 and 30. + +.BI hello_time " HELLO_TIME " +- set the time in seconds between hello packets sent by the bridge, +when it is a root bridge or a designated bridge. +Only relevant if STP is enabled. Valid values are between 1 and 10. + +.BI max_age " MAX_AGE " +- set the hello packet timeout, ie the time in seconds until another +bridge in the spanning tree is assumed to be dead, after reception of +its last hello message. Only relevant if STP is enabled. Valid values +are between 6 and 40. + +.BI stp_state " STP_STATE " +- turn spanning tree protocol on +.RI ( STP_STATE " > 0) " +or off +.RI ( STP_STATE " == 0) " +for this bridge. + +.BI priority " PRIORITY " +- set this bridge's spanning tree priority, used during STP root +bridge election. +.I PRIORITY +is a 16bit unsigned integer. + +.BI no_linklocal_learn " NO_LINKLOCAL_LEARN " +- turn link-local learning on +.RI ( NO_LINKLOCAL_LEARN " == 0) " +or off +.RI ( NO_LINKLOCAL_LEARN " > 0). 
" +When disabled, the bridge will not learn from link-local frames (default: +enabled). + +.BI fdb_max_learned " FDB_MAX_LEARNED " +- set the maximum number of learned FDB entries. If +.RI ( FDB_MAX_LEARNED " == 0) " +the feature is disabled. Default is +.BR 0 . +.I FDB_MAX_LEARNED +is a 32bit unsigned integer. + +.BI vlan_filtering " VLAN_FILTERING " +- turn VLAN filtering on +.RI ( VLAN_FILTERING " > 0) " +or off +.RI ( VLAN_FILTERING " == 0). " +When disabled, the bridge will not consider the VLAN tag when handling packets. + +.BR vlan_protocol " { " 802.1Q " | " 802.1ad " } " +- set the protocol used for VLAN filtering. + +.BI vlan_default_pvid " VLAN_DEFAULT_PVID " +- set the default PVID (native/untagged VLAN ID) for this bridge. + +.BI vlan_stats_enabled " VLAN_STATS_ENABLED " +- enable +.RI ( VLAN_STATS_ENABLED " == 1) " +or disable +.RI ( VLAN_STATS_ENABLED " == 0) " +per-VLAN stats accounting. + +.BI vlan_stats_per_port " VLAN_STATS_PER_PORT " +- enable +.RI ( VLAN_STATS_PER_PORT " == 1) " +or disable +.RI ( VLAN_STATS_PER_PORT " == 0) " +per-VLAN per-port stats accounting. Can be changed only when there are no port VLANs configured. + +.BI mcast_snooping " MULTICAST_SNOOPING " +- turn multicast snooping on +.RI ( MULTICAST_SNOOPING " > 0) " +or off +.RI ( MULTICAST_SNOOPING " == 0). " + +.BI mcast_vlan_snooping " MULTICAST_VLAN_SNOOPING " +- turn multicast VLAN snooping on +.RI ( MULTICAST_VLAN_SNOOPING " > 0) " +or off +.RI ( MULTICAST_VLAN_SNOOPING " == 0). " + +.BI mcast_router " MULTICAST_ROUTER " +- set bridge's multicast router if IGMP snooping is enabled. +.I MULTICAST_ROUTER +is an integer value having the following meaning: +.in +8 +.sp +.B 0 +- disabled. + +.B 1 +- automatic (queried). + +.B 2 +- permanently enabled. 
+.in -8 + +.BI mcast_query_use_ifaddr " MCAST_QUERY_USE_IFADDR " +- whether to use the bridge's own IP address as source address for IGMP queries +.RI ( MCAST_QUERY_USE_IFADDR " > 0) " +or the default of 0.0.0.0 +.RI ( MCAST_QUERY_USE_IFADDR " == 0). " + +.BI mcast_querier " MULTICAST_QUERIER " +- enable +.RI ( MULTICAST_QUERIER " > 0) " +or disable +.RI ( MULTICAST_QUERIER " == 0) " +IGMP querier, ie sending of multicast queries by the bridge (default: disabled). + +.BI mcast_querier_interval " QUERIER_INTERVAL " +- interval between queries sent by other routers. If no queries are seen +after this delay has passed, the bridge will start to send its own queries +(as if +.BI mcast_querier +was enabled). + +.BI mcast_hash_elasticity " HASH_ELASTICITY " +- set multicast database hash elasticity, ie the maximum chain length +in the multicast hash table (defaults to 4). + +.BI mcast_hash_max " HASH_MAX " +- set maximum size of multicast hash table (defaults to 512, +value must be a power of 2). + +.BI mcast_last_member_count " LAST_MEMBER_COUNT " +- set multicast last member count, ie the number of queries the bridge +will send before stopping forwarding a multicast group after a "leave" +message has been received (defaults to 2). + +.BI mcast_last_member_interval " LAST_MEMBER_INTERVAL " +- interval between queries to find remaining members of a group, +after a "leave" message is received. + +.BI mcast_startup_query_count " STARTUP_QUERY_COUNT " +- set the number of IGMP queries to send during startup phase (defaults to 2). + +.BI mcast_startup_query_interval " STARTUP_QUERY_INTERVAL " +- interval between queries in the startup phase. + +.BI mcast_query_interval " QUERY_INTERVAL " +- interval between queries sent by the bridge after the end of the +startup phase. + +.BI mcast_query_response_interval " QUERY_RESPONSE_INTERVAL " +- set the Max Response Time/Maximum Response Delay for IGMP/MLD +queries sent by the bridge. 
+ +.BI mcast_membership_interval " MEMBERSHIP_INTERVAL " +- delay after which the bridge will leave a group, +if no membership reports for this group are received. + +.BI mcast_stats_enabled " MCAST_STATS_ENABLED " +- enable +.RI ( MCAST_STATS_ENABLED " > 0) " +or disable +.RI ( MCAST_STATS_ENABLED " == 0) " +multicast (IGMP/MLD) stats accounting. + +.BI mcast_igmp_version " IGMP_VERSION " +- set the IGMP version. + +.BI mcast_mld_version " MLD_VERSION " +- set the MLD version. + +.BI nf_call_iptables " NF_CALL_IPTABLES " +- enable +.RI ( NF_CALL_IPTABLES " > 0) " +or disable +.RI ( NF_CALL_IPTABLES " == 0) " +iptables hooks on the bridge. + +.BI nf_call_ip6tables " NF_CALL_IP6TABLES " +- enable +.RI ( NF_CALL_IP6TABLES " > 0) " +or disable +.RI ( NF_CALL_IP6TABLES " == 0) " +ip6tables hooks on the bridge. + +.BI nf_call_arptables " NF_CALL_ARPTABLES " +- enable +.RI ( NF_CALL_ARPTABLES " > 0) " +or disable +.RI ( NF_CALL_ARPTABLES " == 0) " +arptables hooks on the bridge. + + +.in -8 + +.TP +MACsec Type Support +For a link of type +.I MACsec +the following additional arguments are supported: + +.BI "ip link add link " DEVICE " name " NAME " type macsec" +[ [ +.BI address " <lladdr>" +] +.BI port " PORT" +| +.BI sci " SCI" +] [ +.BI cipher " CIPHER_SUITE" +] [ +.BR icvlen " { " +.IR 8..16 " } ] [" +.BR encrypt " {" +.BR on " | " off " } ] [ " +.BR send_sci " { " on " | " off " } ] [" +.BR end_station " { " on " | " off " } ] [" +.BR scb " { " on " | " off " } ] [" +.BR protect " { " on " | " off " } ] [" +.BR replay " { " on " | " off " }" +.BR window " { " +.IR 0..2^32-1 " } ] [" +.BR validate " { " strict " | " check " | " disabled " } ] [" +.BR encodingsa " { " +.IR 0..3 " } ]" + +.in +8 +.sp +.BI address " <lladdr> " +- sets the system identifier component of secure channel for this MACsec device. + +.sp +.BI port " PORT " +- sets the port number component of secure channel for this MACsec +device, in a range from 1 to 65535 inclusive. 
Numbers with a leading " +0 " or " 0x " are interpreted as octal and hexadecimal, respectively. + +.sp +.BI sci " SCI " +- sets the secure channel identifier for this MACsec device. +.I SCI +is a 64bit wide number in hexadecimal format. + +.sp +.BI cipher " CIPHER_SUITE " +- defines the cipher suite to use. + +.sp +.BI icvlen " LENGTH " +- sets the length of the Integrity Check Value (ICV). + +.sp +.BR "encrypt on " or " encrypt off" +- switches between authenticated encryption, or authenticity mode only. + +.sp +.BR "send_sci on " or " send_sci off" +- specifies whether the SCI is included in every packet, +or only when it is necessary. + +.sp +.BR "end_station on " or " end_station off" +- sets the End Station bit. + +.sp +.BR "scb on " or " scb off" +- sets the Single Copy Broadcast bit. + +.sp +.BR "protect on " or " protect off" +- enables MACsec protection on the device. + +.sp +.BR "replay on " or " replay off" +- enables replay protection on the device. + +.in +8 + +.sp +.BI window " SIZE " +- sets the size of the replay window. + +.in -8 + +.sp +.BR "validate strict " or " validate check " or " validate disabled" +- sets the validation mode on the device. + +.sp +.BI encodingsa " AN " +- sets the active secure association for transmission. + +.in -8 + +.TP +VRF Type Support +For a link of type +.I VRF +the following additional arguments are supported: + +.BI "ip link add " DEVICE " type vrf table " TABLE + +.in +8 +.sp +.BR table " table id associated with VRF device" + +.in -8 + +.TP +RMNET Type Support +For a link of type +.I RMNET +the following additional arguments are supported: + +.BI "ip link add link " DEVICE " name " NAME " type rmnet mux_id " MUXID + +.in +8 +.sp +.BI mux_id " MUXID " +- specifies the mux identifier for the rmnet device, possible values 1-254. 
+.in -8 + +.TP +XFRM Type Support +For a link of type +.I XFRM +the following additional arguments are supported: + +.BI "ip link add " DEVICE " type xfrm dev " PHYS_DEV " [ if_id " IF_ID " ]" +.BR "[ external ]" + +.in +8 +.sp +.BI dev " PHYS_DEV " +- specifies the underlying physical interface from which transform traffic is sent and received. + +.sp +.BI if_id " IF_ID " +- specifies the hexadecimal lookup key used to send traffic to and from specific xfrm +policies. Policies must be configured with the same key. If not set, the key defaults to +0 and will match any policies which similarly do not have a lookup key configuration. + +.sp +.BI external +- make this device externally controlled. This flag is mutually exclusive with the +.BR dev " and " if_id +options. + +.in -8 + +.TP +GTP Type Support +For a link of type +.I GTP +the following additional arguments are supported: + +.BI "ip link add " DEVICE " type gtp role " ROLE " hsize " HSIZE + +.in +8 +.sp +.BI role " ROLE " +- specifies the role of the GTP device, either sgsn or ggsn + +.sp +.BI hsize " HSIZE " +- specifies size of the hashtable which stores PDP contexts + +.sp +.BI restart_count " RESTART_COUNT " +- GTP instance restart counter + +.in -8 + +.SS ip link delete - delete virtual link + +.TP +.BI dev " DEVICE " +specifies the virtual device to operate on. + +.TP +.BI group " GROUP " +specifies the group of virtual links to delete. Group 0 is not allowed to be +deleted since it is the default group. + +.TP +.BI type " TYPE " +specifies the type of the device. + +.SS ip link set - change device attributes + +.PP +.B Warning: +If multiple parameter changes are requested, +.B ip +aborts immediately after any of the changes have failed. +This is the only case when +.B ip +can move the system to an unpredictable state. The solution +is to avoid changing several parameters with one +.B ip link set +call. +The modifier +.B change +is equivalent to +.BR "set" .
+ + +.TP +.BI dev " DEVICE " +.I DEVICE +specifies network device to operate on. When configuring SR-IOV +Virtual Function (VF) devices, this keyword should specify the +associated Physical Function (PF) device. + +.TP +.BI group " GROUP " +.I GROUP +has a dual role: If both group and dev are present, then move the device to the +specified group. If only a group is specified, then the command operates on +all devices in that group. + +.TP +.BR up " and " down +change the state of the device to +.B UP +or +.BR "DOWN" . + +.TP +.BR "arp on " or " arp off" +change the +.B NOARP +flag on the device. + +.TP +.BR "multicast on " or " multicast off" +change the +.B MULTICAST +flag on the device. + +.TP +.BR "allmulticast on " or " allmulticast off" +change the +.B ALLMULTI +flag on the device. When enabled, instructs network driver to retrieve all +multicast packets from the network to the kernel for further processing. + +.TP +.BR "promisc on " or " promisc off" +change the +.B PROMISC +flag on the device. When enabled, activates promiscuous operation of the +network device. + +.TP +.BR "trailers on " or " trailers off" +change the +.B NOTRAILERS +flag on the device, +.B NOT +used by the Linux and exists for BSD compatibility. + +.TP +.BR "protodown on " or " protodown off" +change the +.B PROTODOWN +state on the device. Indicates that a protocol error has been detected +on the port. Switch drivers can react to this error by doing a phys +down on the switch port. + +.TP +.BR "protodown_reason PREASON on " or " off" +set +.B PROTODOWN +reasons on the device. protodown reason bit names can be enumerated under +/etc/iproute2/protodown_reasons.d/. possible reasons bits 0-31 + +.TP +.BR "dynamic on " or " dynamic off" +change the +.B DYNAMIC +flag on the device. Indicates that address can change when interface +goes down (currently +.B NOT +used by the Linux). + +.TP +.BI name " NAME" +change the name of the device. 
This operation is not +recommended if the device is running or has some addresses +already configured. + +.TP +.BI txqueuelen " NUMBER" +.TP +.BI txqlen " NUMBER" +change the transmit queue length of the device. + +.TP +.BI mtu " NUMBER" +change the +.I MTU +of the device. + +.TP +.BI address " LLADDRESS" +change the station address of the interface. + +.TP +.BI broadcast " LLADDRESS" +.TP +.BI brd " LLADDRESS" +.TP +.BI peer " LLADDRESS" +change the link layer broadcast address or the peer address when +the interface is +.IR "POINTOPOINT" . + +.TP +.B netns +.RI "{ " PID " | " NETNSNAME " | " NETNSFILE " }" +.br +move the device to the network namespace associated with process +.IR "PID " or +the name +.IR "NETNSNAME " or +the file +.IR "NETNSFILE". + +Some devices are not allowed to change network namespace: loopback, bridge, +wireless. These are network namespace local devices. In such case +.B ip +tool will return "Invalid argument" error. It is possible to find out +if device is local to a single network namespace by checking +.B netns-local +flag in the output of the +.BR ethtool ":" + +.in +8 +.B ethtool -k +.I DEVICE +.in -8 + +To change network namespace for wireless devices the +.B iw +tool can be used. But it allows one to change network namespace only for +physical devices and by process +.IR PID . + +.TP +.BI alias " NAME" +give the device a symbolic name for easy reference. + +.TP +.BI group " GROUP" +specify the group the device belongs to. +The available groups are listed in +.BR @SYSCONF_USR_DIR@/group " or " @SYSCONF_ETC_DIR@/group +(has precedence if exists). + +.TP +.BI vf " NUM" +specify a Virtual Function device to be configured. The associated PF device +must be specified using the +.B dev +parameter. + +.in +8 +.BI mac " LLADDRESS" +- change the station address for the specified VF. The +.B vf +parameter must be specified. + +.sp +.BI vlan " VLANID" +- change the assigned VLAN for the specified VF. 
When specified, all traffic +sent from the VF will be tagged with the specified VLAN ID. Incoming traffic +will be filtered for the specified VLAN ID, and will have all VLAN tags +stripped before being passed to the VF. Setting this parameter to 0 disables +VLAN tagging and filtering. The +.B vf +parameter must be specified. + +.sp +.BI qos " VLAN-QOS" +- assign VLAN QOS (priority) bits for the VLAN tag. When specified, all VLAN +tags transmitted by the VF will include the specified priority bits in the +VLAN tag. If not specified, the value is assumed to be 0. Both the +.B vf +and +.B vlan +parameters must be specified. Setting both +.B vlan +and +.B qos +as 0 disables VLAN tagging and filtering for the VF. + +.sp +.BI proto " VLAN-PROTO" +- assign VLAN PROTOCOL for the VLAN tag, either 802.1Q or 802.1ad. +Setting to 802.1ad, all traffic sent from the VF will be tagged with +VLAN S-Tag. Incoming traffic will have VLAN S-Tags stripped before +being passed to the VF. Setting to 802.1ad also enables an option to +concatenate another VLAN tag, so both S-TAG and C-TAG will be +inserted/stripped for outgoing/incoming traffic, respectively. If not +specified, the value is assumed to be 802.1Q. Both the +.B vf +and +.B vlan +parameters must be specified. + +.sp +.BI rate " TXRATE" +-- change the allowed transmit bandwidth, in Mbps, for the specified VF. +Setting this parameter to 0 disables rate limiting. +.B vf +parameter must be specified. +Please use new API +.B "max_tx_rate" +option instead. + +.sp +.BI max_tx_rate " TXRATE" +- change the allowed maximum transmit bandwidth, in Mbps, for the +specified VF. Setting this parameter to 0 disables rate limiting. +.B vf +parameter must be specified. + +.sp +.BI min_tx_rate " TXRATE" +- change the allowed minimum transmit bandwidth, in Mbps, for the specified VF. +Minimum TXRATE should be always <= Maximum TXRATE. +Setting this parameter to 0 disables rate limiting. +.B vf +parameter must be specified. 
+ +.sp +.BI spoofchk " on|off" +- turn packet spoof checking on or off for the specified VF. +.sp +.BI query_rss " on|off" +- toggle the ability of querying the RSS configuration of a specific +VF. VF RSS information like RSS hash key may be considered sensitive +on some devices where this information is shared between VF and PF +and thus its querying may be prohibited by default. +.sp +.BI state " auto|enable|disable" +- set the virtual link state as seen by the specified VF. Setting to +auto means a reflection of the PF link state, enable lets the VF +communicate with other VFs on this host even if the PF link state is +down, disable causes the HW to drop any packets sent by the VF. +.sp +.BI trust " on|off" +- trust the specified VF user. This enables that VF user can set a +specific feature which may impact security and/or +performance. (e.g. VF multicast promiscuous mode) +.sp +.BI node_guid " eui64" +- configure node GUID for Infiniband VFs. +.sp +.BI port_guid " eui64" +- configure port GUID for Infiniband VFs. +.in -8 + +.TP +.B xdp object "|" pinned "|" off +set (or unset) a XDP ("eXpress Data Path") BPF program to run on every +packet at driver level. +.B ip link +output will indicate a +.B xdp +flag for the networking device. If the driver does not have native XDP +support, the kernel will fall back to a slower, driver-independent "generic" +XDP variant. The +.B ip link +output will in that case indicate +.B xdpgeneric +instead of +.B xdp +only. If the driver does have native XDP support, but the program is +loaded under +.B xdpgeneric object "|" pinned +then the kernel will use the generic XDP variant instead of the native one. +.B xdpdrv +has the opposite effect of requesting that the automatic fallback to the +generic XDP variant be disabled and in case driver is not XDP-capable error +should be returned. +.B xdpdrv +also disables hardware offloads.
+.B xdpoffload +in ip link output indicates that the program has been offloaded to hardware +and can also be used to request the "offload" mode, much like +.B xdpgeneric +it forces program to be installed specifically in HW/FW of the adapter. + +.B off +(or +.B none +) +- Detaches any currently attached XDP/BPF program from the given device. + +.BI object " FILE " +- Attaches a XDP/BPF program to the given device. The +.I FILE +points to a BPF ELF file (f.e. generated by LLVM) that contains the BPF +program code, map specifications, etc. If a XDP/BPF program is already +attached to the given device, an error will be thrown. If no XDP/BPF +program is currently attached, the device supports XDP and the program +from the BPF ELF file passes the kernel verifier, then it will be attached +to the device. If the option +.I -force +is passed to +.B ip +then any prior attached XDP/BPF program will be atomically overridden and +no error will be thrown in this case. If no +.B section +option is passed, then the default section name ("prog") will be assumed, +otherwise the provided section name will be used. If no +.B verbose +option is passed, then a verifier log will only be dumped on load error. +See also +.B EXAMPLES +section for usage examples. + +.BI section " NAME " +- Specifies a section name that contains the BPF program code. If no section +name is specified, the default one ("prog") will be used. This option is +to be passed with the +.B object +option. + +.BI program " NAME " +- Specifies the BPF program name that needs to be attached. When the program +name is specified, the section name parameter will be ignored. This option +only works when iproute2 is built with +.B libbpf +support. + +.BI verbose +- Act in verbose mode. For example, even in case of success, this will +print the verifier log in case a program was loaded from a BPF ELF file. + +.BI pinned " FILE " +- Attaches a XDP/BPF program to the given device.
The +.I FILE +points to an already pinned BPF program in the BPF file system. The option +.B section +doesn't apply here, but otherwise semantics are the same as with the option +.B object +described already. + +.TP +.BI master " DEVICE" +set master device of the device (enslave device). + +.TP +.BI nomaster +unset master device of the device (release device). + +.TP +.BI addrgenmode " eui64|none|stable_secret|random" +set the IPv6 address generation mode + +.I eui64 +- use a Modified EUI-64 format interface identifier + +.I none +- disable automatic address generation + +.I stable_secret +- generate the interface identifier based on a preset + /proc/sys/net/ipv6/conf/{default,DEVICE}/stable_secret + +.I random +- like stable_secret, but auto-generate a new random secret if none is set + +.TP +.BR "link-netnsid " +set peer netnsid for a cross-netns interface + +.TP +.BI type " ETYPE TYPE_ARGS" +Change type-specific settings. For a list of supported types and arguments refer +to the description of +.B "ip link add" +above. 
In addition to that, it is possible to manipulate settings to slave +devices: + +.TP +Bridge Slave Support +For a link with master +.B bridge +the following additional arguments are supported: + +.B "ip link set type bridge_slave" +[ +.B fdb_flush +] [ +.BI state " STATE" +] [ +.BI priority " PRIO" +] [ +.BI cost " COST" +] [ +.BR guard " { " on " | " off " }" +] [ +.BR hairpin " { " on " | " off " }" +] [ +.BR fastleave " { " on " | " off " }" +] [ +.BR root_block " { " on " | " off " }" +] [ +.BR learning " { " on " | " off " }" +] [ +.BR flood " { " on " | " off " }" +] [ +.BR proxy_arp " { " on " | " off " }" +] [ +.BR proxy_arp_wifi " { " on " | " off " }" +] [ +.BI mcast_router " MULTICAST_ROUTER" +] [ +.BR mcast_fast_leave " { " on " | " off "}" +] [ +.BR bcast_flood " { " on " | " off " }" +] [ +.BR mcast_flood " { " on " | " off " }" +] [ +.BR mcast_to_unicast " { " on " | " off " }" +] [ +.BR group_fwd_mask " MASK" +] [ +.BR neigh_suppress " { " on " | " off " }" +] [ +.BR neigh_vlan_suppress " { " on " | " off " }" +] [ +.BR vlan_tunnel " { " on " | " off " }" +] [ +.BR isolated " { " on " | " off " }" +] [ +.BR locked " { " on " | " off " }" +] [ +.BR mab " { " on " | " off " }" +] [ +.BR backup_port " DEVICE" +] [ +.BR nobackup_port +] [ +.BR backup_nhid " NHID" +] + +.in +8 +.sp +.B fdb_flush +- flush bridge slave's fdb dynamic entries. + +.BI state " STATE" +- Set port state. +.I STATE +is a number representing the following states: +.BR 0 " (disabled)," +.BR 1 " (listening)," +.BR 2 " (learning)," +.BR 3 " (forwarding)," +.BR 4 " (blocking)." + +.BI priority " PRIO" +- set port priority (allowed values are between 0 and 63, inclusively). + +.BI cost " COST" +- set port cost (allowed values are between 1 and 65535, inclusively). + +.BR guard " { " on " | " off " }" +- block incoming BPDU packets on this port. + +.BR hairpin " { " on " | " off " }" +- enable hairpin mode on this port. 
This will allow incoming packets on this +port to be reflected back. + +.BR fastleave " { " on " | " off " }" +- enable multicast fast leave on this port. + +.BR root_block " { " on " | " off " }" +- block this port from becoming the bridge's root port. + +.BR learning " { " on " | " off " }" +- allow MAC address learning on this port. + +.BR flood " { " on " | " off " }" +- open the flood gates on this port, i.e. forward all unicast frames to this +port also. Requires +.BR proxy_arp " and " proxy_arp_wifi +to be turned off. + +.BR proxy_arp " { " on " | " off " }" +- enable proxy ARP on this port. + +.BR proxy_arp_wifi " { " on " | " off " }" +- enable proxy ARP on this port which meets extended requirements by IEEE +802.11 and Hotspot 2.0 specifications. + +.BI mcast_router " MULTICAST_ROUTER" +- configure this port for having multicast routers attached. A port with a +multicast router will receive all multicast traffic. +.I MULTICAST_ROUTER +may be either +.B 0 +to disable multicast routers on this port, +.B 1 +to let the system detect the presence of routers (this is the default), +.B 2 +to permanently enable multicast traffic forwarding on this port or +.B 3 +to enable multicast routers temporarily on this port, not depending on incoming +queries. + +.BR mcast_fast_leave " { " on " | " off " }" +- this is a synonym to the +.B fastleave +option above. + +.BR bcast_flood " { " on " | " off " }" +- controls flooding of broadcast traffic on the given port. By default +this flag is on. + +.BR mcast_flood " { " on " | " off " }" +- controls whether a given port will flood multicast traffic for which +there is no MDB entry. By default this flag is on. + +.BR mcast_to_unicast " { " on " | " off " }" +- controls whether a given port will replicate packets using unicast +instead of multicast. By default this flag is off. + +.BI group_fwd_mask " MASK " +- set the group forward mask. 
This is the bitmask that is applied to +decide whether to forward incoming frames destined to link-local +addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to +0, ie the bridge does not forward any link-local frames coming on +this port). + +.BR neigh_suppress " { " on " | " off " }" +- controls whether neigh discovery (arp and nd) proxy and suppression +is enabled on the port. By default this flag is off. + +.BR neigh_vlan_suppress " { " on " | " off " }" +- controls whether per-VLAN neigh discovery (arp and nd) proxy and suppression +is enabled on the port. When on, the \fBbridge link\fR option +\fBneigh_suppress\fR has no effect and the per-VLAN state is set using the +\fBbridge vlan\fR option \fBneigh_suppress\fR. By default this flag is off. + +.BR vlan_tunnel " { " on " | " off " }" +- controls whether vlan to tunnel mapping is enabled on the port. By +default this flag is off. + +.BR locked " { " on " | " off " }" +- controls whether a port is locked or not. When locked, non-link-local frames +received through the port are dropped unless an FDB entry with the MAC source +address points to the port. The common use case is IEEE 802.1X where hosts can +authenticate themselves by exchanging EAPOL frames with an authenticator. After +authentication is complete, the user space control plane can install a matching +FDB entry to allow traffic from the host to be forwarded by the bridge. When +learning is enabled on a locked port, the +.B no_linklocal_learn +bridge option needs to be on to prevent the bridge from learning from received +EAPOL frames. By default this flag is off. + +.BR mab " { " on " | " off " }" +- controls whether MAC Authentication Bypass (MAB) is enabled on the port or +not. MAB can only be enabled on a locked port that has learning enabled. When +enabled, FDB entries are learned from received traffic and have the "locked" +FDB flag set. 
The flag can only be set by the kernel and it indicates that the +FDB entry cannot be used to authenticate the corresponding host. User space can +decide to authenticate the host by replacing the FDB entry and clearing the +"locked" FDB flag. Locked FDB entries can roam to unlocked (authorized) ports +in which case the "locked" flag is cleared. FDB entries cannot roam to locked +ports regardless of MAB being enabled or not. Therefore, locked FDB entries are +only created if an FDB entry with the given {MAC, VID} does not already exist. +This behavior prevents unauthenticated hosts from disrupting traffic destined +to already authenticated hosts. Locked FDB entries act like regular dynamic +entries with respect to forwarding and aging. By default this flag is off. + +.BI backup_port " DEVICE" +- if the port loses carrier all traffic will be redirected to the +configured backup port + +.BR nobackup_port +- removes the currently configured backup port + +.BI backup_nhid " NHID" +- the FDB nexthop object ID (see \fBip-nexthop\fR(8)) to attach to packets +being redirected to a backup port that has VLAN tunnel mapping enabled (via the +\fBvlan_tunnel\fR option). Setting a value of 0 (default) has the effect of not +attaching any ID. + +.in -8 + +.TP +Bonding Slave Support +For a link with master +.B bond +the following additional arguments are supported: + +.B "ip link set type bond_slave" +[ +.BI queue_id " ID" +] [ +.BI prio " PRIORITY" +] + +.in +8 +.sp +.BI queue_id " ID" +- set the slave's queue ID (a 16bit unsigned value). + +.sp +.BI prio " PRIORITY" +- set the slave's priority for active slave re-selection during failover +(a 32bit signed value). This option is only valid for active-backup (1), +balance-tlb (5) and balance-alb (6) modes. + +.in -8 + +.TP +MACVLAN and MACVTAP Support +Modify list of allowed macaddr for link in source mode. + +.B "ip link set type { macvlan | macvtap } " +[ +.BI macaddr " " "" COMMAND " " MACADDR " ..."
+] + +Commands: +.in +8 +.B add +- add MACADDR to allowed list +.sp +.B set +- replace allowed list +.sp +.B del +- remove MACADDR from allowed list +.sp +.B flush +- flush whole allowed list +.sp +.in -8 + +Update the broadcast/multicast queue length. + +.B "ip link set type { macvlan | macvtap } " +[ +.BI bcqueuelen " LENGTH " +] +[ +.BI bclim " LIMIT " +] + +.in +8 +.BI bcqueuelen " LENGTH " +- Set the length of the RX queue used to process broadcast and multicast packets. +.IR LENGTH " must be a non-negative integer in the range [0-4294967295]." +Setting a length of 0 will effectively drop all broadcast/multicast traffic. +If not specified the macvlan driver default (1000) is used. +Note that all macvlans that share the same underlying device are using the same +.RB "queue. The parameter here is a " request ", the actual queue length used" +will be the maximum length that any macvlan interface has requested. +When listing device parameters both the bcqueuelen parameter +as well as the actual used bcqueuelen are listed to better help +the user understand the setting. + +.BI bclim " LIMIT " +- Set the threshold for broadcast queueing. +.IR LIMIT " must be a 32-bit integer." +Setting this to -1 disables broadcast queueing altogether. Otherwise +a multicast address will be queued as broadcast if the number of devices +using it is greater than the given value. +.in -8 + +.TP +DSA user port support +For a link having the DSA user port type, the following additional arguments +are supported: + +.B "ip link set type dsa " +[ +.BI conduit " DEVICE" +] + +.in +8 +.sp +.BI conduit " DEVICE" +- change the DSA conduit (host network interface) responsible for handling the +locally terminated traffic for the given DSA switch user port.
For a +description of which network interfaces are suitable for serving as conduit +interfaces of this user port, please see +https://docs.kernel.org/networking/dsa/configuration.html#affinity-of-user-ports-to-cpu-ports +as well as what is supported by the driver in use. + +.sp +.BI master " DEVICE" +- this is a synonym for "conduit". + +.in -8 + +.SS ip link show - display device attributes + +.TP +.BI dev " NAME " (default) +.I NAME +specifies the network device to show. + +.TP +.BI group " GROUP " +.I GROUP +specifies what group of devices to show. + +.TP +.B up +only display running interfaces. + +.TP +.BI master " DEVICE " +.I DEVICE +specifies the master device which enslaves devices to show. + +.TP +.BI vrf " NAME " +.I NAME +specifies the VRF which enslaves devices to show. + +.TP +.BI type " TYPE " +.I TYPE +specifies the type of devices to show. + +Note that the type name is not checked against the list of supported types - +instead it is sent as-is to the kernel. Later it is used to filter the returned +interface list by comparing it with the relevant attribute in case the kernel +didn't filter already. Therefore any string is accepted, but may lead to empty +output. + +.TP +.B nomaster +only show devices with no master + +.SS ip link xstats - display extended statistics + +.TP +.BI type " TYPE " +.I TYPE +specifies the type of devices to display extended statistics for. + +.SS ip link afstats - display address-family specific statistics + +.TP +.BI dev " DEVICE " +.I DEVICE +specifies the device to display address-family statistics for. + +.SS ip link help - display help + +.PP +.I "TYPE" +specifies which help of link type to display. + +.SS +.I GROUP +may be a number or a string from +.BR @SYSCONF_USR_DIR@/group " or " @SYSCONF_ETC_DIR@/group +which can be manually filled and has precedence if exists. + +.SH "EXAMPLES" +.PP +ip link show +.RS 4 +Shows the state of all network interfaces on the system. 
+.RE +.PP +ip link show type bridge +.RS 4 +Shows the bridge devices. +.RE +.PP +ip link show type vlan +.RS 4 +Shows the vlan devices. +.RE +.PP +ip link show master br0 +.RS 4 +Shows devices enslaved by br0. +.RE +.PP +ip link set dev ppp0 mtu 1400 +.RS 4 +Change the MTU of the ppp0 device. +.RE +.PP +ip link add link eth0 name eth0.10 type vlan id 10 +.RS 4 +Creates a new vlan device eth0.10 on device eth0. +.RE +.PP +ip link delete dev eth0.10 +.RS 4 +Removes vlan device. +.RE + +ip link help gre +.RS 4 +Display help for the gre link type. +.RE +.PP +ip link add name tun1 type ipip remote 192.168.1.1 +local 192.168.1.2 ttl 225 encap gue encap-sport auto +encap-dport 5555 encap-csum encap-remcsum +.RS 4 +Creates an IPIP that is encapsulated with Generic UDP Encapsulation, +and the outer UDP checksum and remote checksum offload are enabled. +.RE +.PP +ip link set dev eth0 xdp obj prog.o +.RS 4 +Attaches a XDP/BPF program to device eth0, where the program is +located in prog.o, section "prog" (default section). In case a +XDP/BPF program is already attached, throw an error. +.RE +.PP +ip -force link set dev eth0 xdp obj prog.o sec foo +.RS 4 +Attaches a XDP/BPF program to device eth0, where the program is +located in prog.o, section "foo". In case a XDP/BPF program is +already attached, it will be overridden by the new one. +.RE +.PP +ip -force link set dev eth0 xdp pinned /sys/fs/bpf/foo +.RS 4 +Attaches a XDP/BPF program to device eth0, where the program was +previously pinned as an object node into BPF file system under +name foo. +.RE +.PP +ip link set dev eth0 xdp off +.RS 4 +If a XDP/BPF program is attached on device eth0, detach it and +effectively turn off XDP for device eth0. +.RE +.PP +ip link add link wpan0 lowpan0 type lowpan +.RS 4 +Creates a 6LoWPAN interface named lowpan0 on the underlying +IEEE 802.15.4 device wpan0.
+.RE +.PP +ip link add dev ip6erspan11 type ip6erspan seq key 102 +local fc00:100::2 remote fc00:100::1 +erspan_ver 2 erspan_dir ingress erspan_hwid 17 +.RS 4 +Creates an IP6ERSPAN version 2 interface named ip6erspan11. +.RE +.PP +ip link set dev swp0 type dsa conduit eth1 +.RS 4 +Changes the conduit interface of the swp0 user port to eth1. +.RE + +.SH SEE ALSO +.br +.BR ip (8), +.BR ip-netns (8), +.BR ethtool (8), +.BR iptables (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-macsec.8 b/man/man8/ip-macsec.8 new file mode 100644 index 0000000..1a14485 --- /dev/null +++ b/man/man8/ip-macsec.8 @@ -0,0 +1,186 @@ +.TH IP\-MACSEC 8 "07 Mar 2016" "iproute" "Linux" +.SH NAME +ip-macsec \- MACsec device configuration +.SH "SYNOPSIS" +.BI "ip link add link " DEVICE " name " NAME " type macsec " +[ [ +.BI address " <lladdr>" +] +.BI port " PORT" +| +.BI sci " <u64>" +] [ +.BR cipher " { " default " | " gcm-aes-128 " | " gcm-aes-256 " | " gcm-aes-xpn-128 " | " gcm-aes-xpn-256 " } ] [" +.BI icvlen " ICVLEN" +] [ +.BR encrypt " { " on " | " off " } ] [" +.BR send_sci " { " on " | " off " } ] [" +.BR end_station " { " on " | " off " } ] [" +.BR scb " { " on " | " off " } ] [" +.BR protect " { " on " | " off " } ] [" +.BR replay " { " on " | " off " } ] [" +.BI window " WINDOW" +] [ +.BR validate " { " strict " | " check " | " disabled " } ] [" +.BI encodingsa " SA" +] [ +.BR offload " { " off " | " phy " | " mac " }" +] + +.BI "ip macsec add " DEV " tx sa" +.RI "{ " 0..3 " } [ " OPTS " ]" +.BI key " ID KEY" +.br +.BI "ip macsec set " DEV " tx sa" +.RI "{ " 0..3 " } [ " OPTS " ]" +.br +.BI "ip macsec del " DEV " tx sa" +.RI "{ " 0..3 " }" + +.BI "ip macsec add " DEV " rx " SCI +.RB [ " on " | " off " ] +.br +.BI "ip macsec set " DEV " rx " SCI +.RB [ " on " | " off " ] +.br +.BI "ip macsec del " DEV " rx " SCI + +.BI "ip macsec add " DEV " rx " SCI " sa" +.RI "{ " 0..3 " } [ " OPTS " ]" +.BI key " ID KEY" +.br +.BI "ip macsec
set " DEV " rx " SCI " sa" +.RI "{ " 0..3 " } [ " OPTS " ]" +.br +.BI "ip macsec del " DEV " rx " SCI " sa" +.RI "{ " 0..3 " }" + +.BI "ip macsec offload " DEV +.RB "{ " off " | " phy " | " mac " }" + +.B ip macsec show +.RI [ " DEV " ] + +.IR OPTS " := [ " +.BR pn " { " +.IR 1..2^32-1 " } |" +.BR xpn " { " +.IR 1..2^64-1 " } ] [" +.B salt +.IR SALT " ] [" +.B ssci +.IR <u32> " ] [" +.BR on " | " off " ]" +.br +.IR SCI " := { " +.B sci +.IR <u64> " | " +.BI port +.IR PORT +.BI address " <lladdr> " +} +.br +.IR PORT " := { " 1..2^16-1 " } " +.br +.IR SALT " := 96-bit hex string " + + +.SH DESCRIPTION +The +.B ip macsec +commands are used to configure transmit secure associations and receive secure channels and their secure associations on a MACsec device created with the +.B ip link add +command using the +.I macsec +type. + +.SH EXAMPLES +.PP +.SS Create a MACsec device on link eth0 (offload is disabled by default) +.nf +# ip link add link eth0 macsec0 type macsec port 11 encrypt on +.PP +.SS Configure a secure association on that device +.nf +# ip macsec add macsec0 tx sa 0 pn 1024 on key 01 81818181818181818181818181818181 +.PP +.SS Configure a receive channel +.nf +# ip macsec add macsec0 rx port 1234 address c6:19:52:8f:e6:a0 +.PP +.SS Configure a receive association +.nf +# ip macsec add macsec0 rx port 1234 address c6:19:52:8f:e6:a0 sa 0 pn 1 on key 00 82828282828282828282828282828282 +.PP +.SS Display MACsec configuration +.nf +# ip macsec show +.PP +.SS Configure offloading on an interface +.nf +# ip macsec offload macsec0 phy +.PP +.SS Configure offloading upon MACsec device creation +.nf +# ip link add link eth0 macsec0 type macsec port 11 encrypt on offload mac + +.SH EXTENDED PACKET NUMBER EXAMPLES +.PP +.SS Create a MACsec device on link eth0 with enabled extended packet number (offload is disabled by default) +.nf +# ip link add link eth0 macsec0 type macsec port 11 encrypt on cipher gcm-aes-xpn-128 +.PP +.SS Configure a secure association on that 
device +.nf +# ip macsec add macsec0 tx sa 0 xpn 1024 on salt 838383838383838383838383 ssci 123 key 01 81818181818181818181818181818181 +.PP +.SS Configure a receive channel +.nf +# ip macsec add macsec0 rx port 11 address c6:19:52:8f:e6:a0 +.PP +.SS Configure a receive association +.nf +# ip macsec add macsec0 rx port 11 address c6:19:52:8f:e6:a0 sa 0 xpn 1 on salt 838383838383838383838383 ssci 123 key 00 82828282828282828282828282828282 +.PP +.SS Display MACsec configuration +.nf +# ip macsec show +.PP + +.SH NOTES +This tool can be used to configure the 802.1AE keys of the interface. Note that 802.1AE uses GCM-AES +with an initialization vector (IV) derived from the packet number. The same key must not be used +with the same IV more than once. Instead, keys must be frequently regenerated and distributed. +This tool is thus mostly for debugging and testing, or in combination with a user-space application +that reconfigures the keys. It is wrong to just configure the keys statically and assume them to work +indefinitely. The suggested and standardized way for key management is 802.1X-2010, which is implemented +by wpa_supplicant. + +.SH EXTENDED PACKET NUMBER NOTES +Passing cipher +.B gcm-aes-xpn-128 +or +.B gcm-aes-xpn-256 +to +.B ip link add +command using the +.I macsec +type requires using the keyword +.B 'xpn' +instead of +.B 'pn' +in addition to providing a salt using the +.B 'salt' +keyword and ssci using the +.B 'ssci' +keyword when using the +.B ip macsec +command. 
+ + +.SH SEE ALSO +.br +.BR ip-link (8) +.BR wpa_supplicant (8) +.SH AUTHOR +Sabrina Dubroca <sd@queasysnail.net> diff --git a/man/man8/ip-maddress.8 b/man/man8/ip-maddress.8 new file mode 100644 index 0000000..f3432bb --- /dev/null +++ b/man/man8/ip-maddress.8 @@ -0,0 +1,59 @@ +.TH IP\-MADDRESS 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-maddress \- multicast addresses management +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B maddress +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 + +.BR "ip maddress" " [ " add " | " del " ]" +.IB MULTIADDR " dev " NAME + +.ti -8 +.BR "ip maddress show" " [ " dev +.IR NAME " ]" + +.SH DESCRIPTION +.B maddress +objects are multicast addresses. + +.SS ip maddress show - list multicast addresses + +.TP +.BI dev " NAME " (default) +the device name. + +.TP +.B ip maddress add - add a multicast address +.TP +.B ip maddress delete - delete a multicast address +.sp +These commands attach/detach a static link-layer multicast address +to listen on the interface. +Note that it is impossible to join protocol multicast groups +statically. This command only manages link-layer addresses. + +.RS +.TP +.BI address " LLADDRESS " (default) +the link-layer multicast address. + +.TP +.BI dev " NAME" +the device to join/leave this multicast address. 
+.RE + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-monitor.8 b/man/man8/ip-monitor.8 new file mode 100644 index 0000000..ec033c6 --- /dev/null +++ b/man/man8/ip-monitor.8 @@ -0,0 +1,133 @@ +.TH IP\-MONITOR 8 "13 Dec 2012" "iproute2" "Linux" +.SH "NAME" +ip-monitor, rtmon \- state monitoring +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.BR "ip monitor" " [ " all " |" +.IR OBJECT-LIST " ] [" +.BI file " FILENAME " +] [ +.BI label +] [ +.BI all-nsid +] [ +.BI dev " DEVICE " +] +.sp + +.SH OPTIONS + +.TP +.BR "\-t" , " \-timestamp" +Prints timestamp before the event message on a separate line in format: + Timestamp: <Day> <Month> <DD> <hh:mm:ss> <YYYY> <usecs> usec + <EVENT> + +.TP +.BR "\-ts" , " \-tshort" +Prints short timestamp before the event message on the same line in format: + [<YYYY>-<MM>-<DD>T<hh:mm:ss>.<ms>] <EVENT> + +.SH DESCRIPTION +The +.B ip +utility can monitor the state of devices, addresses +and routes continuously. This option has a slightly different format. +Namely, the +.B monitor +command is the first in the command line and then the object list follows: + +.BR "ip monitor" " [ " all " |" +.IR OBJECT-LIST " ] [" +.BI file " FILENAME " +] [ +.BI label +] [ +.BI all-nsid +] [ +.BI dev " DEVICE " +] + +.I OBJECT-LIST +is the list of object types that we want to monitor. +It may contain +.BR link ", " address ", " route ", " mroute ", " prefix ", " +.BR neigh ", " netconf ", " rule ", " stats ", " nsid " and " nexthop "." +If no +.B file +argument is given, +.B ip +opens RTNETLINK, listens on it and dumps state changes in the format +described in previous sections. + +.P +If the +.BI label +option is set, a prefix is displayed before each message to +show the family of the message. 
For example: +.sp +.in +2 +[NEIGH]10.16.0.112 dev eth0 lladdr 00:04:23:df:2f:d0 REACHABLE +[LINK]3: eth1: <BROADCAST,MULTICAST> mtu 1500 qdisc pfifo_fast state DOWN group default + link/ether 52:54:00:12:34:57 brd ff:ff:ff:ff:ff:ff +.in -2 +.sp + +.P +If the +.BI all-nsid +option is set, the program listens to all network namespaces that have a +nsid assigned into the network namespace where the program is running. +A prefix is displayed to show the network namespace where the message +originates. Example: +.sp +.in +2 +[nsid 0]10.16.0.112 dev eth0 lladdr 00:04:23:df:2f:d0 REACHABLE +.in -2 +.sp + +.P +If the +.BI file +option is given, the program does not listen on RTNETLINK, +but opens the given file, and dumps its contents. The file +should contain RTNETLINK messages saved in binary format. +Such a file can be generated with the +.B rtmon +utility. This utility has a command line syntax similar to +.BR "ip monitor" . +Ideally, +.B rtmon +should be started before the first network configuration command +is issued. F.e. if you insert: +.sp +.in +8 +rtmon file /var/log/rtmon.log +.in -8 +.sp +in a startup script, you will be able to view the full history +later. + +.P +Nevertheless, it is possible to start +.B rtmon +at any time. +It prepends the history with the state snapshot dumped at the moment +of starting. + +.P +If the +.BI dev +option is given, the program prints only events related to this device. 
+ +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> +.br +Manpage revised by Nicolas Dichtel <nicolas.dichtel@6wind.com> diff --git a/man/man8/ip-mptcp.8 b/man/man8/ip-mptcp.8 new file mode 100644 index 0000000..6c70895 --- /dev/null +++ b/man/man8/ip-mptcp.8 @@ -0,0 +1,235 @@ +'\" t +.TH IP\-MPTCP 8 "4 Apr 2020" "iproute2" "Linux" +.SH "NAME" +ip-mptcp \- MPTCP path manager configuration +.SH "SYNOPSIS" +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B mptcp +.RB "{ " +.B endpoint +.RB " | " +.B limits +.RB " | " +.B help +.RB " }" +.sp + +.ti -8 +.BR "ip mptcp endpoint add " +.IR IFADDR +.RB "[ " port +.IR PORT " ]" +.RB "[ " dev +.IR IFNAME " ]" +.RB "[ " id +.I ID +.RB "] [ " +.I FLAG-LIST +.RB "] " + +.ti -8 +.BR "ip mptcp endpoint delete id " +.I ID +.RB "[ " +.I IFADDR +.RB "] " + +.ti -8 +.BR "ip mptcp endpoint change " +.RB "[ " id +.I ID +.RB "] [ " +.IR IFADDR +.RB "] [ " port +.IR PORT " ]" +.RB "CHANGE-OPT" + +.ti -8 +.BR "ip mptcp endpoint show " +.RB "[ " id +.I ID +.RB "]" + +.ti -8 +.BR "ip mptcp endpoint flush" + +.ti -8 +.IR FLAG-LIST " := [ " FLAG-LIST " ] " FLAG + +.ti -8 +.IR FLAG " := [" +.B signal +.RB "|" +.B subflow +.RB "|" +.B backup +.RB "|" +.B fullmesh +.RB "]" + +.ti -8 +.IR CHANGE-OPT " := [" +.B backup +.RB "|" +.B nobackup +.RB "|" +.B fullmesh +.RB "|" +.B nofullmesh +.RB "]" + +.ti -8 +.BR "ip mptcp limits set " +.RB "[ " +.B subflow +.IR SUBFLOW_NR " ]" +.RB "[ " +.B add_addr_accepted +.IR ADD_ADDR_ACCEPTED_NR " ]" + +.ti -8 +.BR "ip mptcp limits show" + +.ti -8 +.BR "ip mptcp monitor" + +.SH DESCRIPTION + +MPTCP is a transport protocol built on top of TCP that allows TCP +connections to use multiple paths to maximize resource usage and increase +redundancy. 
The ip-mptcp sub-commands allow configuring several aspects of the +MPTCP path manager, which is in charge of subflows creation: + +.P +The +.B endpoint +object specifies the IP addresses that will be used and/or announced for +additional subflows: + +.TS +l l. +ip mptcp endpoint add add new MPTCP endpoint +ip mptcp endpoint delete delete existing MPTCP endpoint +ip mptcp endpoint show get existing MPTCP endpoint +ip mptcp endpoint flush flush all existing MPTCP endpoints +.TE + +.TP +.IR IFADDR +An IPv4 or IPv6 address. When used with the +.B delete id +operation, an +.B IFADDR +is only included when the +.B ID +is 0. + +.TP +.IR PORT +When a port number is specified, incoming MPTCP subflows for already +established MPTCP sockets will be accepted on the specified port, regardless +the original listener port accepting the first MPTCP subflow and/or +this peer being actually on the client side. + +.TP +.IR ID +is a unique numeric identifier for the given endpoint + +.TP +.BR signal +The endpoint will be announced/signaled to each peer via an MPTCP ADD_ADDR +sub-option. Upon reception of an ADD_ADDR sub-option, the peer can try to +create additional subflows, see +.BR ADD_ADDR_ACCEPTED_NR. + +.TP +.BR subflow +If additional subflow creation is allowed by the MPTCP limits, the MPTCP +path manager will try to create an additional subflow using this endpoint +as the source address after the MPTCP connection is established. + +.TP +.BR backup +If this is a +.BR subflow +endpoint, the subflows created using this endpoint will have the backup +flag set during the connection process. This flag instructs the peer to +only send data on a given subflow when all non-backup subflows are +unavailable. 
This does not affect outgoing data, where subflow priority +is determined by the backup/non-backup flag received from the peer + +.TP +.BR fullmesh +If this is a +.BR subflow +endpoint and additional subflow creation is allowed by the MPTCP limits, +the MPTCP path manager will try to create an additional subflow for each +known peer address, using this endpoint as the source address. This will +occur after the MPTCP connection is established. If the peer did not +announce any additional addresses using the MPTCP ADD_ADDR sub-option, +this will behave the same as a plain +.BR subflow +endpoint. When the peer does announce addresses, each received ADD_ADDR +sub-option will trigger creation of an additional subflow to generate a +full mesh topology. + +.TP +.BR implicit +In some scenarios, an MPTCP +.BR subflow +can use a local address mapped by a implicit endpoint created by the +in-kernel path manager. Once set, the implicit flag cannot be removed, but +other flags can be added to the endpoint. Implicit endpoints cannot be +created from user-space. + +.sp +.PP +The +.B limits +object specifies the constraints for subflow creations: + +.TS +l l. +ip mptcp limits show get current MPTCP subflow creation limits +ip mptcp limits set change the MPTCP subflow creation limits +.TE + +.TP +.IR SUBFLOW_NR +specifies the maximum number of additional subflows allowed for each MPTCP +connection. Additional subflows can be created due to: incoming accepted +ADD_ADDR sub-option, local +.BR subflow +endpoints, additional subflows started by the peer. + +.TP +.IR ADD_ADDR_ACCEPTED_NR +specifies the maximum number of incoming ADD_ADDR sub-options accepted for +each MPTCP connection. After receiving the specified number of ADD_ADDR +sub-options, any other incoming one will be ignored for the MPTCP connection +lifetime. 
When an ADD_ADDR sub-option is accepted and there are no local +.IR fullmesh +endpoints, the MPTCP path manager will try to create a new subflow using the +address in the ADD_ADDR sub-option as the destination address and a source +address determined using local routing resolution. +When +.IR fullmesh +endpoints are available, the MPTCP path manager will try to create new subflows +using each +.IR fullmesh +endpoint as a source address and the peer's ADD_ADDR address as the destination. +In both cases the +.IR SUBFLOW_NR +limit is enforced. + +.sp +.PP +.B monitor +displays creation and deletion of MPTCP connections as well as addition or removal of remote addresses and subflows. + +.SH AUTHOR +Original Manpage by Paolo Abeni <pabeni@redhat.com> diff --git a/man/man8/ip-mroute.8 b/man/man8/ip-mroute.8 new file mode 100644 index 0000000..b64e30d --- /dev/null +++ b/man/man8/ip-mroute.8 @@ -0,0 +1,58 @@ +.TH IP\-MROUTE 8 "13 Dec 2012" "iproute2" "Linux" +.SH "NAME" +ip-mroute \- multicast routing cache management +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.BR "ip mroute show" " [ [ " +.BR " to " " ] " +.IR PREFIX " ] [ " +.B from +.IR PREFIX " ] [ " +.B iif +.IR DEVICE " ] [ " +.B table +.IR TABLE_ID " ] " + +.SH DESCRIPTION +.B mroute +objects are multicast routing cache entries created by a user-level +mrouting daemon (f.e. +.B pimd +or +.B mrouted +). + +Due to the limitations of the current interface to the multicast routing +engine, it is impossible to change +.B mroute +objects administratively, so we can only display them. This limitation +will be removed in the future. + +.SS ip mroute show - list mroute cache entries + +.TP +.BI to " PREFIX " (default) +the prefix selecting the destination multicast addresses to list. + +.TP +.BI iif " NAME" +the interface on which multicast packets are received. + +.TP +.BI from " PREFIX" +the prefix selecting the IP source addresses of the multicast route. 
+ +.TP +.BI table " TABLE_ID" +the table id selecting the multicast table. It can be +.BR local ", " main ", " default ", " all " or a number." + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-neighbour.8 b/man/man8/ip-neighbour.8 new file mode 100644 index 0000000..6fed47c --- /dev/null +++ b/man/man8/ip-neighbour.8 @@ -0,0 +1,303 @@ +.TH IP\-NEIGHBOUR 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-neighbour \- neighbour/arp tables management. +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B neigh +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.BR "ip neigh" " { " add " | " del " | " change " | " replace " } { " +.IR ADDR " [ " +.B lladdr +.IR LLADDR " ] [ " +.B nud +.IR STATE " ] |" +.B proxy +.IR ADDR " } [ " +.B dev +.IR DEV " ] [ " +.BR router " ] [ " +.BR use " ] [ " +.BR managed " ] [ " +.BR extern_learn " ]" + +.ti -8 +.BR "ip neigh" " { " show " | " flush " } [ " proxy " ] [ " to +.IR PREFIX " ] [ " +.B dev +.IR DEV " ] [ " +.B nud +.IR STATE " ] [ " +.B vrf +.IR NAME " ] [" +.BR nomaster " ]" + +.ti -8 +.B ip neigh get +.IR ADDR +.B dev +.IR DEV + +.ti -8 +.IR STATE " := {" +.BR permanent " | " noarp " | " stale " | " reachable " | " none " |" +.BR incomplete " | " delay " | " probe " | " failed " }" + +.SH DESCRIPTION +The +.B ip neigh +command manipulates +.I neighbour +objects that establish bindings between protocol addresses and +link layer addresses for hosts sharing the same link. +Neighbour entries are organized into tables. The IPv4 neighbour table +is also known by another name - the ARP table. + +.P +The corresponding commands display neighbour bindings +and their properties, add new neighbour entries and delete old ones. 
+ +.TP +ip neighbour add +add a new neighbour entry +.TP +ip neighbour change +change an existing entry +.TP +ip neighbour replace +add a new entry or change an existing one +.RS +.PP +These commands create new neighbour records or update existing ones. + +.TP +.BI to " ADDRESS " (default) +the protocol address of the neighbour. It is either an IPv4 or IPv6 address. + +.TP +.BI dev " NAME" +the interface to which this neighbour is attached. + +.TP +.BI proxy +indicates whether we are proxying for this neighbour entry + +.TP +.BI router +indicates whether neighbour is a router + +.TP +.BI use +this neigh entry is in "use". This option can be used to indicate to +the kernel that a controller is using this dynamic entry. If the entry +does not exist, the kernel will resolve it. If it exists, an attempt +to refresh the neighbor entry will be triggered. + +.TP +.BI managed +this neigh entry is "managed". This option can be used to indicate to +the kernel that a controller is using this dynamic entry. In contrast +to "use", if the entry does not exist, the kernel will resolve it and +periodically attempt to auto-refresh the neighbor entry such that it +remains in resolved state when possible. + +.TP +.BI extern_learn +this neigh entry was learned externally. This option can be used to +indicate to the kernel that this is a controller learnt dynamic entry. +Kernel will not gc such an entry. + +.TP +.BI lladdr " LLADDRESS" +the link layer address of the neighbour. +.I LLADDRESS +can also be +.BR "null" . + +.TP +.BI nud " STATE" +the state of the neighbour entry. +.B nud +is an abbreviation for 'Neighbour Unreachability Detection'. +The state can take one of the following values: + +.RS +.TP +.B permanent +the neighbour entry is valid forever and can only +be removed administratively. +.TP +.B noarp +the neighbour entry is valid. No attempts to validate +this entry will be made but it can be removed when its lifetime expires. 
+.TP +.B reachable +the neighbour entry is valid until the reachability +timeout expires. +.TP +.B stale +the neighbour entry is valid but suspicious. +This option to +.B ip neigh +does not change the neighbour state if it was valid and the address +is not changed by this command. +.TP +.B none +this is a pseudo state used when initially creating a neighbour entry or after +trying to remove it before it becomes free to do so. +.TP +.B incomplete +the neighbour entry has not (yet) been validated/resolved. +.TP +.B delay +neighbor entry validation is currently delayed. +.TP +.B probe +neighbor is being probed. +.TP +.B failed +max number of probes exceeded without success, neighbor validation has +ultimately failed. +.RE +.RE + +.TP +ip neighbour delete +delete a neighbour entry +.RS +.PP +The arguments are the same as with +.BR "ip neigh add" , +except that +.B lladdr +and +.B nud +are ignored. + +.PP +.B Warning: +Attempts to delete or manually change a +.B noarp +entry created by the kernel may result in unpredictable behaviour. +Particularly, the kernel may try to resolve this address even +on a +.B NOARP +interface or if the address is multicast or broadcast. +.RE + +.TP +ip neighbour show +list neighbour entries +.RS +.TP +.BI to " ADDRESS " (default) +the prefix selecting the neighbours to list. + +.TP +.BI dev " NAME" +only list the neighbours attached to this device. + +.TP +.BI vrf " NAME" +only list the neighbours for given VRF. + +.TP +.BI nomaster +only list neighbours attached to an interface with no master. + +.TP +.BI proxy +list neighbour proxies. + +.TP +.B unused +only list neighbours which are not currently in use. + +.TP +.BI nud " STATE" +only list neighbour entries in this state. +.I NUD_STATE +takes values listed below or the special value +.B all +which means all states. This option may occur more than once. +If this option is absent, +.B ip +lists all entries except for +.B none +and +.BR "noarp" . 
+.RE + +.TP +ip neighbour flush +flush neighbour entries +.RS +This command has the same arguments as +.B show. +The differences are that it does not run when no arguments are given, +and that the default neighbour states to be flushed do not include +.B permanent +and +.BR "noarp" . + +.PP +With the +.B -statistics +option, the command becomes verbose. It prints out the number of +deleted neighbours and the number of rounds made to flush the +neighbour table. If the option is given +twice, +.B ip neigh flush +also dumps all the deleted neighbours. +.RE + +.TP +ip neigh get +lookup a neighbour entry to a destination given a device +.RS + +.TP +.BI proxy +indicates whether we should lookup a proxy neighbour entry + +.TP +.BI to " ADDRESS " (default) +the prefix selecting the neighbour to query. + +.TP +.BI dev " NAME" +get neighbour entry attached to this device. +.RE + +.SH EXAMPLES +.PP +ip neighbour +.RS +Shows the current neighbour table in kernel. +.RE +.PP +ip neigh flush dev eth0 +.RS +Removes entries in the neighbour table on device eth0. +.RE +.PP +ip neigh get 10.0.1.10 dev eth0 +.RS +Performs a neighbour lookup in the kernel and returns +a neighbour entry. +.RE + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-netconf.8 b/man/man8/ip-netconf.8 new file mode 100644 index 0000000..7fe3e5f --- /dev/null +++ b/man/man8/ip-netconf.8 @@ -0,0 +1,36 @@ +.TH IP\-NETCONF 8 "13 Dec 2012" "iproute2" "Linux" +.SH "NAME" +ip-netconf \- network configuration monitoring +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.BR "ip " " [ ip-OPTIONS ] " "netconf show" " [ " +.B dev +.IR NAME " ]" + +.SH DESCRIPTION +The +.B ip netconf +utility can monitor IPv4 and IPv6 parameters (see +.BR "/proc/sys/net/ipv[4|6]/conf/[all|DEV]/" ")" +like forwarding, rp_filter, proxy_neigh, ignore_routes_with_linkdown +or mc_forwarding status. + +If no interface is specified, the entry +.B all +is displayed. 
+ +.SS ip netconf show - display network parameters + +.TP +.BI dev " NAME" +the name of the device to display network parameters for. + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Nicolas Dichtel <nicolas.dichtel@6wind.com> diff --git a/man/man8/ip-netns.8.in b/man/man8/ip-netns.8.in new file mode 100644 index 0000000..2911bdd --- /dev/null +++ b/man/man8/ip-netns.8.in @@ -0,0 +1,271 @@ +.TH IP\-NETNS 8 "16 Jan 2013" "iproute2" "Linux" +.SH NAME +ip-netns \- process network namespace management +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B netns +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 +.BR "ip netns" " [ " list " ]" + +.ti -8 +.B ip netns add +.I NETNSNAME + +.ti -8 +.B ip netns attach +.I NETNSNAME PID + +.ti -8 +.B ip [-all] netns del +.RI "[ " NETNSNAME " ]" + +.ti -8 +.B ip netns set +.I NETNSNAME NETNSID + +.ti -8 +.IR NETNSID " := " auto " | " POSITIVE-INT + +.ti -8 +.BR "ip netns identify" +.RI "[ " PID " ]" + +.ti -8 +.BR "ip netns pids" +.I NETNSNAME + +.ti -8 +.BR "ip [-all] netns exec " +.RI "[ " NETNSNAME " ] " command ... + +.ti -8 +.BR "ip netns monitor" + +.ti -8 +.BR "ip netns list-id" +.RI "[ target-nsid " POSITIVE-INT " ] [ nsid " POSITIVE-INT " ]" + +.SH DESCRIPTION +A network namespace is logically another copy of the network stack, +with its own routes, firewall rules, and network devices. + +By default a process inherits its network namespace from its parent. Initially all +the processes share the same default network namespace from the init process. + +By convention a named network namespace is an object at +.BR "@NETNS_RUN_DIR@/" NAME +that can be opened. The file descriptor resulting from opening +.BR "@NETNS_RUN_DIR@/" NAME +refers to the specified network namespace. Holding that file +descriptor open keeps the network namespace alive. The file +descriptor can be used with the +.B setns(2) +system call to change the network namespace associated with a task. 
+ +For applications that are aware of network namespaces, the convention +is to look for global network configuration files first in +.BR "@NETNS_ETC_DIR@/" NAME "/" +then in +.BR "/etc/". +For example, if you want a different version of +.BR /etc/resolv.conf +for a network namespace used to isolate your vpn you would name it +.BR @NETNS_ETC_DIR@/myvpn/resolv.conf. + +.B ip netns exec +automates handling of this configuration, file convention for network +namespace unaware applications, by creating a mount namespace and +bind mounting all of the per network namespace configure files into +their traditional location in /etc. + +.TP +.B ip netns list - show all of the named network namespaces +.sp +This command displays all of the network namespaces in @NETNS_RUN_DIR@ + +.TP +.B ip netns add NAME - create a new named network namespace +.sp +If NAME is available in @NETNS_RUN_DIR@ this command creates a new +network namespace and assigns NAME. + +.TP +.B ip netns attach NAME PID - create a new named network namespace +.sp +If NAME is available in @NETNS_RUN_DIR@ this command attaches the network +namespace of the process PID to NAME as if it were created with ip netns. + +.TP +.B ip [-all] netns delete [ NAME ] - delete the name of a network namespace(s) +.sp +If NAME is present in @NETNS_RUN_DIR@ it is umounted and the mount +point is removed. If this is the last user of the network namespace the +network namespace will be freed and all physical devices will be moved to the +default one, otherwise the network namespace persists until it has no more +users. ip netns delete may fail if the mount point is in use in another mount +namespace. + +If +.B -all +option was specified then all the network namespace names will be removed. 
+ +It is possible to lose the physical device when it was moved to netns and +then this netns was deleted with a running process: + +.RS 10 +$ ip netns add net0 +.RE +.RS 10 +$ ip link set dev eth0 netns net0 +.RE +.RS 10 +$ ip netns exec net0 SOME_PROCESS_IN_BACKGROUND +.RE +.RS 10 +$ ip netns del net0 +.RE + +.RS +and eth0 will appear in the default netns only after SOME_PROCESS_IN_BACKGROUND +will exit or will be killed. To prevent this the processes running in net0 +should be killed before deleting the netns: + +.RE +.RS 10 +$ ip netns pids net0 | xargs kill +.RE +.RS 10 +$ ip netns del net0 +.RE + +.TP +.B ip netns set NAME NETNSID - assign an id to a peer network namespace +.sp +This command assigns an id to a peer network namespace. This id is valid +only in the current network namespace. +If the keyword "auto" is specified an available nsid will be chosen. +This id will be used by the kernel in some netlink messages. If no id is +assigned when the kernel needs it, it will be automatically assigned by +the kernel. +Once it is assigned, it's not possible to change it. + +.TP +.B ip netns identify [PID] - Report network namespaces names for process +.sp +This command walks through @NETNS_RUN_DIR@ and finds all the network +namespace names for network namespace of the specified process, if PID is +not specified then the current process will be used. + +.TP +.B ip netns pids NAME - Report processes in the named network namespace +.sp +This command walks through proc and finds all of the processes that have +the named network namespace as their primary network namespace. + +.TP +.B ip [-all] netns exec [ NAME ] cmd ... - Run cmd in the named network namespace +.sp +This command allows applications that are network namespace unaware +to be run in something other than the default network namespace with +all of the configuration for the specified network namespace appearing +in the customary global locations. 
A network namespace and bind mounts +are used to move files from their network namespace specific location +to their default locations without affecting other processes. + +If +.B -all +option was specified then +.B cmd +will be executed synchronously on each named network namespace even if +.B cmd +fails on some of them. Network namespace name is printed on each +.B cmd +executing. + +.TP +.B ip netns monitor - Report as network namespace names are added and deleted +.sp +This command watches network namespace name addition and deletion events +and prints a line for each event it sees. + +.TP +.B ip netns list-id [target-nsid POSITIVE-INT] [nsid POSITIVE-INT] - list network namespace ids (nsid) +.sp +Network namespace ids are used to identify a peer network namespace. This +command displays nsids of the current network namespace and provides the +corresponding iproute2 netns name (from @NETNS_RUN_DIR@) if any. + +The +.B target-nsid +option enables to display nsids of the specified network namespace instead of the current network +namespace. This +.B target-nsid +is a nsid from the current network namespace. + +The +.B nsid +option enables to display only this nsid. It is a nsid from the current network namespace. In +combination with the +.B target-nsid +option, it enables to convert a specific nsid from the current network namespace to a nsid of the +.B target-nsid +network namespace. + +.SH EXAMPLES +.PP +ip netns list +.RS +Shows the list of current named network namespaces +.RE +.PP +ip netns add vpn +.RS +Creates a network namespace and names it vpn +.RE +.PP +ip netns exec vpn ip link set lo up +.RS +Bring up the loopback interface in the vpn network namespace. 
+.RE +.PP +ip netns add foo +.br +ip netns add bar +.br +ip netns set foo 12 +.br +ip netns set bar 13 +.br +ip -n foo netns set foo 22 +.br +ip -n foo netns set bar 23 +.br +ip -n bar netns set foo 32 +.br +ip -n bar netns set bar 33 +.br +ip netns list-id target-nsid 12 +.RS +Shows the list of nsids from the network namespace foo. +.RE +ip netns list-id target-nsid 12 nsid 13 +.RS +Get nsid of bar from the network namespace foo (result is 23). +.RE + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Eric W. Biederman +.br +Manpage revised by Nicolas Dichtel <nicolas.dichtel@6wind.com> diff --git a/man/man8/ip-nexthop.8 b/man/man8/ip-nexthop.8 new file mode 100644 index 0000000..f81a591 --- /dev/null +++ b/man/man8/ip-nexthop.8 @@ -0,0 +1,327 @@ +.TH IP\-NEXTHOP 8 "30 May 2019" "iproute2" "Linux" +.SH "NAME" +ip-nexthop \- nexthop object management +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " ip-OPTIONS " ]" +.B nexthop +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 + +.ti -8 +.BR "ip nexthop" " { " +.BR show " | " flush " } " +.I SELECTOR + +.ti -8 +.BR "ip nexthop" " { " add " | " replace " } id " +.I ID +.IR NH + +.ti -8 +.BR "ip nexthop" " { " get " | " del " } id " +.I ID + +.ti -8 +.BI "ip nexthop bucket list " BUCKET_SELECTOR + +.ti -8 +.BR "ip nexthop bucket get " id +.I ID +.RI "index " INDEX + +.ti -8 +.IR SELECTOR " := " +.RB "[ " id +.IR ID " ] [ " +.B dev +.IR DEV " ] [ " +.B vrf +.IR NAME " ] [ " +.B master +.IR DEV " ] [ " +.BR groups " ] [ " +.BR fdb " ]" + +.ti -8 +.IR BUCKET_SELECTOR " := " +.IR SELECTOR +.RB " | [ " nhid +.IR ID " ]" + +.ti -8 +.IR NH " := { " +.BR blackhole " | [ " +.B via +.IR ADDRESS " ] [ " +.B dev +.IR DEV " ] [ " +.BR onlink " ] [ " +.B encap +.IR ENCAP " ] [ " +.BR fdb " ] | " +.B group +.IR GROUP " [ " +.BR fdb " ] [ " +.B type +.IR TYPE " [ " TYPE_ARGS " ] ] }" + +.ti -8 +.IR ENCAP " := [ " +.IR ENCAP_MPLS " ] " + +.ti -8 +.IR ENCAP_MPLS " := " +.BR mpls " [ " +.IR LABEL " ] [" +.B 
ttl +.IR TTL " ]" + +.ti -8 +.IR GROUP " := " +.BR id "[," weight "[/...]" + +.ti -8 +.IR TYPE " := { " +.BR mpath " | " resilient " }" + +.ti -8 +.IR TYPE_ARGS " := [ " +.IR RESILIENT_ARGS " ] " + +.ti -8 +.IR RESILIENT_ARGS " := " +.RB "[ " buckets +.IR BUCKETS " ] [ " +.B idle_timer +.IR IDLE " ] [ " +.B unbalanced_timer +.IR UNBALANCED " ]" + +.SH DESCRIPTION +.B ip nexthop +is used to manipulate entries in the kernel's nexthop tables. +.TP +ip nexthop add id ID +add new nexthop entry +.TP +ip nexthop replace id ID +change the configuration of a nexthop or add new one +.RS +.TP +.BI via " [ FAMILY ] ADDRESS" +the address of the nexthop router, in the address family FAMILY. +Address family must match address family of nexthop instance. +.TP +.BI dev " NAME" +is the output device. +.TP +.B onlink +pretend that the nexthop is directly attached to this link, +even if it does not match any interface prefix. +.TP +.BI encap " ENCAPTYPE ENCAPHDR" +attach tunnel encapsulation attributes to this route. +.sp +.I ENCAPTYPE +is a string specifying the supported encapsulation type. Namely: + +.in +8 +.BI mpls +- encapsulation type MPLS +.sp +.in -8 +.I ENCAPHDR +is a set of encapsulation attributes specific to the +.I ENCAPTYPE. + +.in +8 +.B mpls +.in +2 +.I MPLSLABEL +- mpls label stack with labels separated by +.I "/" +.sp + +.B ttl +.I TTL +- TTL to use for MPLS header or 0 to inherit from IP header +.in -2 + +.TP +.BI group " GROUP [ " type " TYPE [ TYPE_ARGS ] ]" +create a nexthop group. Group specification is id with an optional +weight (id,weight) and a '/' as a separator between entries. +.sp +.I TYPE +is a string specifying the nexthop group type. Namely: + +.in +8 +.BI mpath +- Multipath nexthop group backed by the hash-threshold algorithm. The +default when the type is unspecified. +.sp +.BI resilient +- Resilient nexthop group. Group is resilient to addition and deletion of +nexthops. 
+ +.sp +.in -8 +.I TYPE_ARGS +is a set of attributes specific to the +.I TYPE. + +.in +8 +.B resilient +.in +2 +.B buckets +.I BUCKETS +- Number of nexthop buckets. Cannot be changed for an existing group +.sp + +.B idle_timer +.I IDLE +- Time in seconds in which a nexthop bucket does not see traffic and is +therefore considered idle. Default is 120 seconds + +.B unbalanced_timer +.I UNBALANCED +- Time in seconds in which a nexthop group is unbalanced and is therefore +considered unbalanced. The kernel will try to rebalance unbalanced groups, which +might result in some flows being reset. A value of 0 means that no +rebalancing will take place. Default is 0 seconds +.in -2 + +.TP +.B blackhole +create a blackhole nexthop +.TP +.B fdb +nexthop and nexthop groups for use with layer-2 fdb entries. +A fdb nexthop group can only have fdb nexthops. +Example: Used to represent a vxlan remote vtep ip. layer-2 vxlan +fdb entry pointing to an ecmp nexthop group containing multiple +remote vtep ips. +.RE + +.TP +ip nexthop delete id ID +delete nexthop with given id. + +.TP +ip nexthop show +show the contents of the nexthop table or the nexthops +selected by some criteria. +.RS +.TP +.BI dev " DEV " +show the nexthops using the given device. +.TP +.BI vrf " NAME " +show the nexthops using devices associated with the vrf name +.TP +.BI master " DEV " +show the nexthops using devices enslaved to given master device +.TP +.BI groups +show only nexthop groups +.TP +.BI fdb +show only fdb nexthops and nexthop groups +.RE +.TP +ip nexthop flush +flushes nexthops selected by some criteria. Criteria options are the same +as show. + +.TP +ip nexthop get id ID +get a single nexthop by id + +.TP +ip nexthop bucket show +show the contents of the nexthop bucket table or the nexthop buckets +selected by some criteria. 
+.RS +.TP +.BI id " ID " +.in +0 +show the nexthop buckets that belong to a nexthop group with a given id +.TP +.BI nhid " ID " +.in +0 +show the nexthop buckets that hold a nexthop with a given id +.TP +.BI dev " DEV " +.in +0 +show the nexthop buckets using the given device +.TP +.BI vrf " NAME " +.in +0 +show the nexthop buckets using devices associated with the vrf name +.TP +.BI master " DEV " +.in +0 +show the nexthop buckets using devices enslaved to given master device +.RE + +.TP +ip nexthop bucket get id ID index INDEX +get a single nexthop bucket by nexthop group id and bucket index + +.SH EXAMPLES +.PP +ip nexthop ls +.RS 4 +Show all nexthop entries in the kernel. +.RE +.PP +ip nexthop add id 1 via 192.168.1.1 dev eth0 +.RS 4 +Adds an IPv4 nexthop with id 1 using the gateway 192.168.1.1 out device eth0. +.RE +.PP +ip nexthop add id 2 encap mpls 200/300 via 10.1.1.1 dev eth0 +.RS 4 +Adds an IPv4 nexthop with mpls encapsulation attributes attached to it. +.RE +.PP +ip nexthop add id 3 group 1/2 +.RS 4 +Adds a nexthop with id 3. The nexthop is a group using nexthops with ids +1 and 2 at equal weight. +.RE +.PP +ip nexthop add id 4 group 1,5/2,11 +.RS 4 +Adds a nexthop with id 4. The nexthop is a group using nexthops with ids +1 and 2 with nexthop 1 at weight 5 and nexthop 2 at weight 11. +.RE +.PP +ip nexthop add id 5 via 192.168.1.2 fdb +.RS 4 +Adds a fdb nexthop with id 5. +.RE +.PP +ip nexthop add id 7 group 5/6 fdb +.RS 4 +Adds a fdb nexthop group with id 7. A fdb nexthop group can only have +fdb nexthops. +.RE +.PP +ip nexthop add id 10 group 1/2 type resilient buckets 32 +.RS 4 +Add a resilient nexthop group with id 10 and 32 nexthop buckets. 
+.RE +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by David Ahern <dsahern@kernel.org> diff --git a/man/man8/ip-ntable.8 b/man/man8/ip-ntable.8 new file mode 100644 index 0000000..4f0f2e5 --- /dev/null +++ b/man/man8/ip-ntable.8 @@ -0,0 +1,106 @@ +.TH IP\-NTABLE 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-ntable \- neighbour table configuration +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B ntable +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.BR "ip ntable change name" +.IR NAME " [ " +.B dev +.IR DEV " ] [" +.B thresh1 +.IR VAL " ] [" +.B thresh2 +.IR VAL " ] [" +.B thresh3 +.IR VAL " ] [" +.B gc_int +.IR MSEC " ] [" +.B base_reachable +.IR MSEC " ] [" +.B retrans +.IR MSEC " ] [" +.B gc_stale +.IR MSEC " ] [" +.B delay_probe +.IR MSEC " ] [" +.B queue +.IR LEN " ] [" +.B app_probes +.IR VAL " ] [" +.B ucast_probes +.IR VAL " ] [" +.B mcast_probes +.IR VAL " ] [" +.B anycast_delay +.IR MSEC " ] [" +.B proxy_delay +.IR MSEC " ] [" +.B proxy_queue +.IR LEN " ] [" +.B locktime +.IR MSEC " ]" + +.ti -8 +.BR "ip ntable show" " [ " +.B dev +.IR DEV " ] [ " +.B name +.IR NAME " ]" + +.SH DESCRIPTION +.I ip ntable +controls the parameters for the neighbour tables. + +.SS ip ntable show - list the ip neighbour tables + +This command displays neighbour table parameters and statistics. + +.TP +.BI dev " DEV" +only list the table attached to this device. + +.TP +.BI name " NAME" +only list the table with the given name. + +.SS ip ntable change - modify table parameter + +This command allows modifying table parameters such as timers and queue lengths. +.TP +.BI name " NAME" +the name of the table to modify. + +.TP +.BI dev " DEV" +the name of the device to modify the table values. + +.SH EXAMPLES +.PP +ip ntable show dev eth0 +.RS 4 +Shows the neighbour table (IPv4 ARP and IPv6 ndisc) parameters on device eth0.
+.RE +.PP +ip ntable change name arp_cache queue 8 dev eth0 +.RS 4 +Changes the number of packets queued while address is being resolved from the +default value (3) to 8 packets. +.RE + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Manpage by Stephen Hemminger diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in new file mode 100644 index 0000000..10387bc --- /dev/null +++ b/man/man8/ip-route.8.in @@ -0,0 +1,1411 @@ +.TH IP\-ROUTE 8 "13 Dec 2012" "iproute2" "Linux" +.SH "NAME" +ip-route \- routing table management +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " ip-OPTIONS " ]" +.B route +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 + +.ti -8 +.BR "ip route" " { " +.BR show " | " flush " } " +.I SELECTOR + +.ti -8 +.BR "ip route save" +.I SELECTOR + +.ti -8 +.BR "ip route restore" + +.ti -8 +.B ip route get +.I ROUTE_GET_FLAGS +.IR ADDRESS " [ " +.BI from " ADDRESS " iif " STRING" +.RB " ] [ " oif +.IR STRING " ] [ " +.B mark +.IR MARK " ] [ " +.B tos +.IR TOS " ] [ " +.B vrf +.IR NAME " ] [ " +.B ipproto +.IR PROTOCOL " ] [ " +.B sport +.IR NUMBER " ] [ " +.B dport +.IR NUMBER " ] " + +.ti -8 +.BR "ip route" " { " add " | " del " | " change " | " append " | "\ +replace " } " +.I ROUTE + +.ti -8 +.IR SELECTOR " := " +.RB "[ " root +.IR PREFIX " ] [ " +.B match +.IR PREFIX " ] [ " +.B exact +.IR PREFIX " ] [ " +.B table +.IR TABLE_ID " ] [ " +.B vrf +.IR NAME " ] [ " +.B proto +.IR RTPROTO " ] [ " +.B type +.IR TYPE " ] [ " +.B scope +.IR SCOPE " ]" + +.ti -8 +.IR ROUTE " := " NODE_SPEC " [ " INFO_SPEC " ]" + +.ti -8 +.IR NODE_SPEC " := [ " TYPE " ] " PREFIX " [" +.B tos +.IR TOS " ] [ " +.B table +.IR TABLE_ID " ] [ " +.B proto +.IR RTPROTO " ] [ " +.B scope +.IR SCOPE " ] [ " +.B metric +.IR METRIC " ] [ " +.B ttl-propagate +.RB "{ " enabled " | " disabled " } ]" + +.ti -8 +.IR INFO_SPEC " := { " NH " | " +.B nhid +.IR ID " } " "OPTIONS FLAGS" " [" +.B nexthop +.IR NH " ] ..." 
+ +.ti -8 +.IR NH " := [ " +.B encap +.IR ENCAP " ] [ " +.B via +[ +.IR FAMILY " ] " ADDRESS " ] [ " +.B dev +.IR STRING " ] [ " +.B weight +.IR NUMBER " ] " NHFLAGS + +.ti -8 +.IR FAMILY " := [ " +.BR inet " | " inet6 " | " mpls " | " bridge " | " link " ]" + +.ti -8 +.IR OPTIONS " := " FLAGS " [ " +.B mtu +.IR NUMBER " ] [ " +.B advmss +.IR NUMBER " ] [ " +.B as +[ +.B to +] +.IR ADDRESS " ] [ " +.B rtt +.IR TIME " ] [ " +.B rttvar +.IR TIME " ] [ " +.B reordering +.IR NUMBER " ] [ " +.B window +.IR NUMBER " ] [ " +.B cwnd +.IR NUMBER " ] [ " +.B ssthresh +.IR NUMBER " ] [ " +.B realms +.IR REALM " ] [ " +.B rto_min +.IR TIME " ] [ " +.B initcwnd +.IR NUMBER " ] [ " +.B initrwnd +.IR NUMBER " ] [ " +.B features +.IR FEATURES " ] [ " +.B quickack +.IR BOOL " ] [ " +.B congctl +.IR NAME " ] [ " +.B pref +.IR PREF " ] [ " +.B expires +.IR TIME " ] [" +.B fastopen_no_cookie +.IR BOOL " ]" + +.ti -8 +.IR TYPE " := [ " +.BR unicast " | " local " | " broadcast " | " multicast " | "\ +throw " | " unreachable " | " prohibit " | " blackhole " | " nat " ]" + +.ti -8 +.IR TABLE_ID " := [ " +.BR local " | " main " | " default " | " all " |" +.IR NUMBER " ]" + +.ti -8 +.IR SCOPE " := [ " +.BR host " | " link " | " global " |" +.IR NUMBER " ]" + +.ti -8 +.IR NHFLAGS " := [ " +.BR onlink " | " pervasive " ]" + +.ti -8 +.IR RTPROTO " := [ " +.BR kernel " | " boot " | " static " |" +.IR NUMBER " ]" + +.ti -8 +.IR FEATURES " := [ " +.BR ecn " ]" + +.ti -8 +.IR PREF " := [ " +.BR low " | " medium " | " high " ]" + +.ti -8 +.IR ENCAP " := [ " +.IR ENCAP_MPLS " | " ENCAP_IP " | " ENCAP_BPF " | " +.IR ENCAP_SEG6 " | " ENCAP_SEG6LOCAL " | " ENCAP_IOAM6 " ] " + +.ti -8 +.IR ENCAP_MPLS " := " +.BR mpls " [ " +.IR LABEL " ] [" +.B ttl +.IR TTL " ]" + +.ti -8 +.IR ENCAP_IP " := " +.B ip +.B id +.IR TUNNEL_ID +.B dst +.IR REMOTE_IP " [ " +.B src +.IR SRC " ] [" +.B tos +.IR TOS " ] [" +.B ttl +.IR TTL " ]" + +.ti -8 +.IR ENCAP_BPF " := " +.BR bpf " [ " +.B in +.IR PROG " ] [" +.B out +.IR PROG
" ] [" +.B xmit +.IR PROG " ] [" +.B headroom +.IR SIZE " ]" + +.ti -8 +.IR ENCAP_SEG6 " := " +.B seg6 +.BR mode " [ " +.BR encap " | " encap.red " | " inline " | " l2encap " | " l2encap.red " ] " +.B segs +.IR SEGMENTS " [ " +.B hmac +.IR KEYID " ]" + +.ti -8 +.IR ENCAP_SEG6LOCAL " := " +.B seg6local +.BR action +.IR SEG6_ACTION " [ " +.IR SEG6_ACTION_PARAM " ] [ " +.BR count " ] " + +.ti -8 +.IR ENCAP_IOAM6 " := " +.BR ioam6 " [" +.B freq +.IR K "/" N " ] " +.BR mode " [ " +.BR inline " | " encap " | " auto " ] [" +.B tundst +.IR ADDRESS " ] " +.B trace +.B prealloc +.B type +.IR IOAM6_TRACE_TYPE +.B ns +.IR IOAM6_NAMESPACE +.B size +.IR IOAM6_TRACE_SIZE + +.ti -8 +.IR ROUTE_GET_FLAGS " := " +.BR " [ " +.BR fibmatch +.BR " ] " + +.SH DESCRIPTION +.B ip route +is used to manipulate entries in the kernel routing tables. +.sp +.B Route types: + +.in +8 +.B unicast +- the route entry describes real paths to the destinations covered +by the route prefix. + +.sp +.B unreachable +- these destinations are unreachable. Packets are discarded and the +ICMP message +.I host unreachable +is generated. +The local senders get an +.I EHOSTUNREACH +error. + +.sp +.B blackhole +- these destinations are unreachable. Packets are discarded silently. +The local senders get an +.I EINVAL +error. + +.sp +.B prohibit +- these destinations are unreachable. Packets are discarded and the +ICMP message +.I communication administratively prohibited +is generated. The local senders get an +.I EACCES +error. + +.sp +.B local +- the destinations are assigned to this host. The packets are looped +back and delivered locally. + +.sp +.B broadcast +- the destinations are broadcast addresses. The packets are sent as +link broadcasts. + +.sp +.B throw +- a special control route used together with policy rules. If such a +route is selected, lookup in this table is terminated pretending that +no route was found. Without policy routing it is equivalent to the +absence of the route in the routing table. 
The packets are dropped +and the ICMP message +.I net unreachable +is generated. The local senders get an +.I ENETUNREACH +error. + +.sp +.B nat +- a special NAT route. Destinations covered by the prefix +are considered to be dummy (or external) addresses which require translation +to real (or internal) ones before forwarding. The addresses to translate to +are selected with the attribute +.BR "via" . +.B Warning: +Route NAT is no longer supported in Linux 2.6. + +.sp +.B anycast +.RI "- " "not implemented" +the destinations are +.I anycast +addresses assigned to this host. They are mainly equivalent +to +.B local +with one difference: such addresses are invalid when used +as the source address of any packet. + +.sp +.B multicast +- a special type used for multicast routing. It is not present in +normal routing tables. +.in -8 + +.P +.B Route tables: +Linux-2.x can pack routes into several routing tables identified +by a number in the range from 1 to 2^32-1 or by name from +.BR @SYSCONF_USR_DIR@/rt_tables " or " @SYSCONF_ETC_DIR@/rt_tables +(has precedence if exists). +By default all normal routes are inserted into the +.B main +table (ID 254) and the kernel only uses this table when calculating routes. +Values (0, 253, 254, and 255) are reserved for built-in use. + +.sp +Actually, one other table always exists, which is invisible but +even more important. It is the +.B local +table (ID 255). This table +consists of routes for local and broadcast addresses. The kernel maintains +this table automatically and the administrator usually need not modify it +or even look at it. + +The multiple routing tables enter the game when +.I policy routing +is used. + +.TP +ip route add +add new route +.TP +ip route change +change route +.TP +ip route replace +change or add new one +.RS +.TP +.BI to " TYPE PREFIX " (default) +the destination prefix of the route. If +.I TYPE +is omitted, +.B ip +assumes type +.BR "unicast" . +Other values of +.I TYPE +are listed above. 
+.I PREFIX +is an IP or IPv6 address optionally followed by a slash and the +prefix length. If the length of the prefix is missing, +.B ip +assumes a full-length host route. There is also a special +.I PREFIX +.B default +- which is equivalent to IP +.B 0/0 +or to IPv6 +.BR "::/0" . + +.TP +.BI tos " TOS" +.TP +.BI dsfield " TOS" +the Type Of Service (TOS) key. This key has no associated mask and +the longest match is understood as: First, compare the TOS +of the route and of the packet. If they are not equal, then the packet +may still match a route with a zero TOS. +.I TOS +is either an 8 bit hexadecimal number or an identifier +from +.BR @SYSCONF_USR_DIR@/rt_dsfield " or " @SYSCONF_ETC_DIR@/rt_dsfield +(has precedence if exists). + +.TP +.BI metric " NUMBER" +.TP +.BI preference " NUMBER" +the preference value of the route. +.I NUMBER +is an arbitrary 32bit number, where routes with lower values are preferred. + +.TP +.BI table " TABLEID" +the table to add this route to. +.I TABLEID +may be a number or a string from +.BR @SYSCONF_USR_DIR@/rt_tables " or " @SYSCONF_ETC_DIR@/rt_tables +(has precedence if exists). +If this parameter is omitted, +.B ip +assumes the +.B main +table, with the exception of +.BR local ", " broadcast " and " nat +routes, which are put into the +.B local +table by default. + +.TP +.BI vrf " NAME" +the vrf name to add this route to. Implicitly means the table +associated with the VRF. + +.TP +.BI dev " NAME" +the output device name. + +.TP +.BI via " [ FAMILY ] ADDRESS" +the address of the nexthop router, in the address family FAMILY. +Actually, the sense of this field depends on the route type. For +normal +.B unicast +routes it is either the true next hop router or, if it is a direct +route installed in BSD compatibility mode, it can be a local address +of the interface. For NAT routes it is the first address of the block +of translated IP destinations. 
+ +.TP +.BI src " ADDRESS" +the source address to prefer when sending to the destinations +covered by the route prefix. + +.TP +.BI realm " REALMID" +the realm to which this route is assigned. +.I REALMID +may be a number or a string from +.BR @SYSCONF_USR_DIR@/rt_realms " or " @SYSCONF_ETC_DIR@/rt_realms +(has precedence if exists). + +.TP +.BI mtu " MTU" +.TP +.BI "mtu lock" " MTU" +the MTU along the path to the destination. If the modifier +.B lock +is not used, the MTU may be updated by the kernel due to +Path MTU Discovery. If the modifier +.B lock +is used, no path MTU discovery will be tried, all packets +will be sent without the DF bit in IPv4 case or fragmented +to MTU for IPv6. + +.TP +.BI window " NUMBER" +the maximal window for TCP to advertise to these destinations, +measured in bytes. It limits maximal data bursts that our TCP +peers are allowed to send to us. + +.TP +.BI rtt " TIME" +the initial RTT ('Round Trip Time') estimate. If no suffix is +specified the units are raw values passed directly to the +routing code to maintain compatibility with previous releases. +Otherwise, a suffix of s, sec or secs specifies +seconds and ms, msec or msecs specifies milliseconds. + + +.TP +.BI rttvar " TIME " "(Linux 2.3.15+ only)" +the initial RTT variance estimate. Values are specified as with +.BI rtt +above. + +.TP +.BI rto_min " TIME " "(Linux 2.6.23+ only)" +the minimum TCP Retransmission TimeOut to use when communicating with this +destination. Values are specified as with +.BI rtt +above. + +.TP +.BI ssthresh " NUMBER " "(Linux 2.3.15+ only)" +an estimate for the initial slow start threshold. + +.TP +.BI cwnd " NUMBER " "(Linux 2.3.15+ only)" +the clamp for congestion window. It is ignored if the +.B lock +flag is not used. + +.TP +.BI initcwnd " NUMBER " "(Linux 2.5.70+ only)" +the initial congestion window size for connections to this destination.
+Actual window size is this value multiplied by the MSS +(``Maximal Segment Size'') for same connection. The default is +zero, meaning to use the values specified in RFC2414. + +.TP +.BI initrwnd " NUMBER " "(Linux 2.6.33+ only)" +the initial receive window size for connections to this destination. +Actual window size is this value multiplied by the MSS of the connection. +The default value is zero, meaning to use Slow Start value. + +.TP +.BI features " FEATURES " (Linux 3.18+ only) +Enable or disable per-route features. Only available feature at this +time is +.B ecn +to enable explicit congestion notification when initiating connections to the +given destination network. +When responding to a connection request from the given network, ecn will +also be used even if the +.B net.ipv4.tcp_ecn +sysctl is set to 0. + +.TP +.BI quickack " BOOL " "(Linux 3.11+ only)" +Enable or disable quick ack for connections to this destination. + +.TP +.BI fastopen_no_cookie " BOOL " "(Linux 4.15+ only)" +Enable TCP Fastopen without a cookie for connections to this destination. + +.TP +.BI congctl " NAME " "(Linux 3.20+ only)" +.TP +.BI "congctl lock" " NAME " "(Linux 3.20+ only)" +Sets a specific TCP congestion control algorithm only for a given destination. +If not specified, Linux keeps the current global default TCP congestion control +algorithm, or the one set from the application. If the modifier +.B lock +is not used, an application may nevertheless overwrite the suggested congestion +control algorithm for that destination. If the modifier +.B lock +is used, then an application is not allowed to overwrite the specified congestion +control algorithm for that destination, thus it will be enforced/guaranteed to +use the proposed algorithm. + +.TP +.BI advmss " NUMBER " "(Linux 2.3.15+ only)" +the MSS ('Maximal Segment Size') to advertise to these +destinations when establishing TCP connections. 
If it is not given, +Linux uses a default value calculated from the first hop device MTU. +(If the path to these destinations is asymmetric, this guess may be wrong.) + +.TP +.BI reordering " NUMBER " "(Linux 2.3.15+ only)" +Maximal reordering on the path to this destination. +If it is not given, Linux uses the value selected with +.B sysctl +variable +.BR "net/ipv4/tcp_reordering" . + +.TP +.BI nexthop " NEXTHOP" +the nexthop of a multipath route. +.I NEXTHOP +is a complex value with its own syntax similar to the top level +argument lists: + +.in +8 +.BI via " [ FAMILY ] ADDRESS" +- is the nexthop router. +.sp + +.BI dev " NAME" +- is the output device. +.sp + +.BI weight " NUMBER" +- is a weight for this element of a multipath +route reflecting its relative bandwidth or quality. +.in -8 + +The internal buffer used in iproute2 limits the maximum number of nexthops that +may be specified in one go. If only +.I ADDRESS +is given, the current buffer size allows for 144 IPv6 nexthops and 253 IPv4 +ones. For IPv4, this effectively limits the number of nexthops possible per +route. With IPv6, further nexthops may be appended to the same route via +.B "ip route append" +command. + +.TP +.BI scope " SCOPE_VAL" +the scope of the destinations covered by the route prefix. +.I SCOPE_VAL +may be a number or a string from +.BR @SYSCONF_USR_DIR@/rt_scopes " or " @SYSCONF_ETC_DIR@/rt_scopes +(has precedence if exists). +If this parameter is omitted, +.B ip +assumes scope +.B global +for all gatewayed +.B unicast +routes, scope +.B link +for direct +.BR unicast " and " broadcast +routes and scope +.BR host " for " local +routes. + +.TP +.BI protocol " RTPROTO" +the routing protocol identifier of this route. +.I RTPROTO +may be a number or a string from +.BR @SYSCONF_USR_DIR@/rt_protos " or " @SYSCONF_ETC_DIR@/rt_protos +(has precedence if exists). +If the routing protocol ID is not given, +.BR ip " assumes protocol" +.B boot +(i.e.
it assumes the route was added by someone who doesn't +understand what they are doing). Several protocol values have +a fixed interpretation. +Namely: + +.in +8 +.B redirect +- the route was installed due to an ICMP redirect. +.sp + +.B kernel +- the route was installed by the kernel during autoconfiguration. +.sp + +.B boot +- the route was installed during the bootup sequence. +If a routing daemon starts, it will purge all of them. +.sp + +.B static +- the route was installed by the administrator +to override dynamic routing. Routing daemon will respect them +and, probably, even advertise them to its peers. +.sp + +.B ra +- the route was installed by Router Discovery protocol. +.in -8 + +.sp +The rest of the values are not reserved and the administrator is free +to assign (or not to assign) protocol tags. + +.TP +.B onlink +pretend that the nexthop is directly attached to this link, +even if it does not match any interface prefix. + +.TP +.BI pref " PREF" +the IPv6 route preference. +.I PREF +is a string specifying the route preference as defined in RFC4191 for Router +Discovery messages. Namely: + +.in +8 +.B low +- the route has a lowest priority +.sp + +.B medium +- the route has a default priority +.sp + +.B high +- the route has a highest priority +.sp + +.TP +.BI nhid " ID" +use nexthop object with given id as nexthop specification. +.sp +.TP +.BI encap " ENCAPTYPE ENCAPHDR" +attach tunnel encapsulation attributes to this route. +.sp +.I ENCAPTYPE +is a string specifying the supported encapsulation type. Namely: + +.in +8 +.BI mpls +- encapsulation type MPLS +.sp +.BI ip +- IP encapsulation (Geneve, GRE, VXLAN, ...) +.sp +.BI bpf +- Execution of BPF program +.sp +.BI seg6 +- encapsulation type IPv6 Segment Routing +.sp +.BI seg6local +- local SRv6 segment processing +.sp +.BI ioam6 +- encapsulation type IPv6 IOAM +.sp +.BI xfrm +- encapsulation type XFRM + +.in -8 +.I ENCAPHDR +is a set of encapsulation attributes specific to the +.I ENCAPTYPE. 
+ +.in +8 +.B mpls +.in +2 +.I MPLSLABEL +- mpls label stack with labels separated by +.I "/" +.sp + +.B ttl +.I TTL +- TTL to use for MPLS header or 0 to inherit from IP header +.in -2 +.sp + +.B ip +.in +2 +.B id +.I TUNNEL_ID +.B dst +.IR REMOTE_IP " [ " +.B src +.IR SRC " ] [" +.B tos +.IR TOS " ] [" +.B ttl +.IR TTL " ] [ " +.BR key " ] [ " csum " ] [ " seq " ] " +.in -2 +.sp + +.B bpf +.in +2 +.B in +.I PROG +- BPF program to execute for incoming packets +.sp + +.B out +.I PROG +- BPF program to execute for outgoing packets +.sp + +.B xmit +.I PROG +- BPF program to execute for transmitted packets +.sp + +.B headroom +.I SIZE +- Size of header BPF program will attach (xmit) +.in -2 +.sp + +.B seg6 +.in +2 +.B mode inline +- Directly insert Segment Routing Header after IPv6 header +.sp + +.B mode encap +- Encapsulate packet in an outer IPv6 header with SRH +.sp + +.B mode encap.red +- Encapsulate packet in an outer IPv6 header with SRH applying the +reduced segment list. When there is only one segment and the HMAC is +not present, the SRH is omitted. +.sp + +.B mode l2encap +- Encapsulate ingress L2 frame within an outer IPv6 header and SRH +.sp + +.B mode l2encap.red +- Encapsulate ingress L2 frame within an outer IPv6 header and SRH +applying the reduced segment list. When there is only one segment +and the HMAC is not present, the SRH is omitted. +.sp + +.I SEGMENTS +- List of comma-separated IPv6 addresses +.sp + +.I KEYID +- Numerical value in decimal representation. See \fBip-sr\fR(8). +.in -2 +.sp + +.B seg6local +.in +2 +.IR SEG6_ACTION " [ " +.IR SEG6_ACTION_PARAM " ] [ " +.BR count " ] " +- Operation to perform on matching packets. The optional \fBcount\fR +attribute is used to collect statistics on the processing of actions. +Three counters are implemented: 1) packets correctly processed; +2) bytes correctly processed; 3) packets that cause a processing error +(i.e., missing SID List, wrong SID List, etc). 
To retrieve the counters +related to an action use the \fB-s\fR flag in the \fBshow\fR command. +The following actions are currently supported (\fBLinux 4.14+ only\fR). +.in +2 + +.BR End " [ " flavors +.IR FLAVORS " ] " +- Regular SRv6 processing as intermediate segment endpoint. +This action only accepts packets with a non-zero Segments Left +value. Other matching packets are dropped. The presence of flavors +can change the regular processing of an End behavior according to +the user-provided Flavor operations and information carried in the packet. +See \fBFlavors parameters\fR section. + +.B End.X nh6 +.I NEXTHOP +.RB [ " flavors " +.IR FLAVORS " ] " +- Regular SRv6 processing as intermediate segment endpoint. +Additionally, forward processed packets to given next-hop. +This action only accepts packets with a non-zero Segments Left +value. Other matching packets are dropped. The presence of flavors +can change the regular processing of an End.X behavior according to +the user-provided Flavor operations and information carried in the packet. +See \fBFlavors parameters\fR section. + + +.B End.DX6 nh6 +.I NEXTHOP +- Decapsulate inner IPv6 packet and forward it to the +specified next-hop. If the argument is set to ::, then +the next-hop is selected according to the local selection +rules. This action only accepts packets with either a zero Segments +Left value or no SRH at all, and an inner IPv6 packet. Other +matching packets are dropped. + +.BR End.DT6 " { " table " | " vrftable " } " +.I TABLEID +- Decapsulate the inner IPv6 packet and forward it according to the +specified lookup table. +.I TABLEID +is either a number or a string from +.BR @SYSCONF_USR_DIR@/rt_tables " or " @SYSCONF_ETC_DIR@/rt_tables +(has precedence if exists). +If +.B vrftable +is used, the argument must be a VRF device associated with +the table id. Moreover, the VRF table associated with the +table id must be configured with the VRF strict mode turned +on (net.vrf.strict_mode=1). 
This action only accepts packets +with either a zero Segments Left value or no SRH at all, +and an inner IPv6 packet. Other matching packets are dropped. + +.B End.DT4 vrftable +.I TABLEID +- Decapsulate the inner IPv4 packet and forward it according to the +specified lookup table. +.I TABLEID +is either a number or a string from +.BR @SYSCONF_USR_DIR@/rt_tables " or " @SYSCONF_ETC_DIR@/rt_tables +(has precedence if exists). +The argument must be a VRF device associated with the table id. +Moreover, the VRF table associated with the table id must be configured +with the VRF strict mode turned on (net.vrf.strict_mode=1). This action +only accepts packets with either a zero Segments Left value or no SRH +at all, and an inner IPv4 packet. Other matching packets are dropped. + +.B End.DT46 vrftable +.I TABLEID +- Decapsulate the inner IPv4 or IPv6 packet and forward it according +to the specified lookup table. +.I TABLEID +is either a number or a string from +.BR @SYSCONF_USR_DIR@/rt_tables " or " @SYSCONF_ETC_DIR@/rt_tables +(has precedence if exists). +The argument must be a VRF device associated with the table id. +Moreover, the VRF table associated with the table id must be configured +with the VRF strict mode turned on (net.vrf.strict_mode=1). This action +only accepts packets with either a zero Segments Left value or no SRH +at all, and an inner IPv4 or IPv6 packet. Other matching packets are +dropped. + +.B End.B6 srh segs +.IR SEGMENTS " [ " +.B hmac +.IR KEYID " ] " +- Insert the specified SRH immediately after the IPv6 header, +update the DA with the first segment of the newly inserted SRH, +then forward the resulting packet. The original SRH is not +modified. This action only accepts packets with a non-zero +Segments Left value. Other matching packets are dropped. + +.B End.B6.Encaps srh segs +.IR SEGMENTS " [ " +.B hmac +.IR KEYID " ] " +- Regular SRv6 processing as intermediate segment endpoint. 
+Additionally, encapsulate the matching packet within an outer IPv6 header +followed by the specified SRH. The destination address of the outer IPv6 +header is set to the first segment of the new SRH. The source +address is set as described in \fBip-sr\fR(8). + +.B Flavors parameters + +The flavors represent additional operations that can modify or extend a +subset of the existing behaviors. +.in +2 + +.B flavors +.IR OPERATION "[," OPERATION "] [" ATTRIBUTES "]" +.in +2 + +.IR OPERATION " := { " +.BR psp " | " +.BR usp " | " +.BR usd " | " +.BR next-csid " }" + +.IR ATTRIBUTES " := {" +.IR "KEY VALUE" " } [" +.IR ATTRIBUTES " ]" + +.IR KEY " := { " +.BR lblen " | " +.BR nflen " } " +.in -2 + +.B psp +- The Penultimate Segment Pop (PSP) copies the last SID from the SID List +(carried by the outermost SRH) into the IPv6 Destination Address (DA) and +removes (i.e. pops) the SRH from the IPv6 header. +The PSP operation takes place only at a penultimate SR Segment Endpoint node +(e.g., the Segment Left must be one) and does not happen at non-penultimate +endpoint nodes. This flavor is currently only supported by End behavior. + +.B usp +- Ultimate Segment Pop of the SRH (not yet supported in kernel) + +.B usd +- Ultimate Segment Decapsulation (not yet supported in kernel) + +.B next-csid +- The NEXT-C-SID mechanism offers the possibility of encoding +several SRv6 segments within a single 128 bit SID address. The NEXT-C-SID +flavor can be configured to support user-provided Locator-Block and +Locator-Node Function lengths. If Locator-Block and/or Locator-Node Function +lengths are not provided by the user during configuration of an SRv6 End +behavior instance with NEXT-C-SID flavor, the default value is 32-bit for +Locator-Block and 16-bit for Locator-Node Function. + +.BI lblen " VALUE " +- defines the Locator-Block length for NEXT-C-SID flavor. +The Locator-Block length must be greater than 0 and evenly divisible by 8. 
This +attribute can be used only with NEXT-C-SID flavor. + +.BI nflen " VALUE " +- defines the Locator-Node Function length for NEXT-C-SID +flavors. The Locator-Node Function length must be greater than 0 and evenly +divisible by 8. This attribute can be used only with NEXT-C-SID flavor. +.in -4 + +.B ioam6 +.in +2 +.B freq K/N +- Inject IOAM in K packets every N packets (default is 1/1). + +.B mode inline +- Directly insert IOAM after IPv6 header (default mode). +.sp + +.B mode encap +- Encapsulate packet in an outer IPv6 header with IOAM. +.sp + +.B mode auto +- Automatically use inline mode for local packets and encap mode for in-transit +packets. +.sp + +.B tundst +.I ADDRESS +- IPv6 address of the tunnel destination (outer header), not used with inline +mode. + +.B type +.I IOAM6_TRACE_TYPE +- List of IOAM data required in the trace, represented by a bitfield (24 bits). +.sp + +.B ns +.I IOAM6_NAMESPACE +- Numerical value to represent an IOAM namespace. See \fBip-ioam\fR(8). +.sp + +.B size +.I IOAM6_TRACE_SIZE +- Size, in octets, of the pre-allocated trace data block. +.in -2 + +.B xfrm +.in +2 +.B if_id +.I IF_ID +.RB " [ " link_dev +.IR LINK_DEV " ] " +.in -4 + +.in -8 + +.TP +.BI expires " TIME " "(Linux 4.4+ only)" +the route will be deleted after the expires time. +.B Only +supported for IPv6 at present. + +.TP +.BR ttl-propagate " { " enabled " | " disabled " } " +Control whether TTL should be propagated from any encap into the +un-encapsulated packet, overriding any global configuration. Only +supported for MPLS at present. +.RE + +.TP +ip route delete +delete route +.RS +.B ip route del +has the same arguments as +.BR "ip route add" , +but their semantics are a bit different. + +Key values +.RB "(" to ", " tos ", " preference " and " table ")" +select the route to delete. If optional attributes are present, +.B ip +verifies that they coincide with the attributes of the route to delete.
+If no route with the given key and attributes was found, +.B ip route del +fails. +.RE + +.TP +ip route show +list routes +.RS +the command displays the contents of the routing tables or the route(s) +selected by some criteria. + +.TP +.BI to " SELECTOR " (default) +only select routes from the given range of destinations. +.I SELECTOR +consists of an optional modifier +.RB "(" root ", " match " or " exact ")" +and a prefix. +.BI root " PREFIX" +selects routes with prefixes not shorter than +.IR PREFIX "." +For example, +.BI root " 0/0" +selects the entire routing table. +.BI match " PREFIX" +selects routes with prefixes not longer than +.IR PREFIX "." +For example, +.BI match " 10.0/16" +selects +.IR 10.0/16 "," +.IR 10/8 " and " 0/0 , +but it does not select +.IR 10.1/16 " and " 10.0.0/24 . +And +.BI exact " PREFIX" +(or just +.IR PREFIX ")" +selects routes with this exact prefix. If none of these options +is present, +.B ip +assumes +.BI root " 0/0" +i.e. it lists the entire table. + +.TP +.BI tos " TOS" +.TP +.BI dsfield " TOS" +only select routes with the given TOS. + +.TP +.BI table " TABLEID" +show the routes from the given table(s). The default setting is to show table +.BR main "." +.I TABLEID +may either be the ID of a real table or one of the special values: +.sp +.in +8 +.B all +- list all of the tables. +.sp +.B cache +- dump the routing cache. +.in -8 + +.TP +.BI vrf " NAME" +show the routes for the table associated with the vrf name. + +.TP +.B cloned +.TP +.B cached +list cloned routes i.e. routes which were dynamically forked from +other routes because some route attribute (e.g. MTU) was updated. +Actually, it is equivalent to +.BR "table cache" "." + +.TP +.BI from " SELECTOR" +the same syntax as for +.BR to "," +but it binds the source address range rather than destinations. +Note that the +.B from +option only works with cloned routes. + +.TP +.BI protocol " RTPROTO" +only list routes of this protocol. 
+ +.TP +.BI scope " SCOPE_VAL" +only list routes with this scope. + +.TP +.BI type " TYPE" +only list routes of this type. + +.TP +.BI dev " NAME" +only list routes going via this device. + +.TP +.BI via " [ FAMILY ] PREFIX" +only list routes going via the nexthop routers selected by +.IR PREFIX "." + +.TP +.BI src " PREFIX" +only list routes with preferred source addresses selected +by +.IR PREFIX "." + +.TP +.BI realm " REALMID" +.TP +.BI realms " FROMREALM/TOREALM" +only list routes with these realms. +.RE + +.TP +ip route flush +flush routing tables +.RS +this command flushes routes selected by some criteria. + +.sp +The arguments have the same syntax and semantics as the arguments of +.BR "ip route show" , +but routing tables are not listed but purged. The only difference is +the default action: +.B show +dumps all the IP main routing table but +.B flush +prints the helper page. + +.sp +With the +.B -statistics +option, the command becomes verbose. It prints out the number of +deleted routes and the number of rounds made to flush the routing +table. If the option is given +twice, +.B ip route flush +also dumps all the deleted routes in the format described in the +previous subsection. +.RE + +.TP +ip route get +get a single route +.RS +this command gets a single route to a destination and prints its +contents exactly as the kernel sees it. + +.TP +.BI fibmatch +Return full fib lookup matched route. Default is to return the resolved +dst entry + +.TP +.BI to " ADDRESS " (default) +the destination address. + +.TP +.BI from " ADDRESS" +the source address. + +.TP +.BI tos " TOS" +.TP +.BI dsfield " TOS" +the Type Of Service. + +.TP +.BI iif " NAME" +the device from which this packet is expected to arrive. + +.TP +.BI oif " NAME" +force the output device on which this packet will be routed. + +.TP +.BI mark " MARK" +the firewall mark +.RB ( "fwmark" ) + +.TP +.BI vrf " NAME" +force the vrf device on which this packet will be routed. 
+ +.TP +.BI ipproto " PROTOCOL" +ip protocol as seen by the route lookup + +.TP +.BI sport " NUMBER" +source port as seen by the route lookup + +.TP +.BI dport " NUMBER" +destination port as seen by the route lookup + +.TP +.B connected +if no source address +.RB "(option " from ")" +was given, relookup the route with the source set to the preferred +address received from the first lookup. +If policy routing is used, it may be a different route. + +.P +Note that this operation is not equivalent to +.BR "ip route show" . +.B show +shows existing routes. +.B get +resolves them and creates new clones if necessary. Essentially, +.B get +is equivalent to sending a packet along this path. +If the +.B iif +argument is not given, the kernel creates a route +to output packets towards the requested destination. +This is equivalent to pinging the destination +with a subsequent +.BR "ip route ls cache" , +however, no packets are actually sent. With the +.B iif +argument, the kernel pretends that a packet arrived from this interface +and searches for a path to forward the packet. +.RE + +.TP +ip route save +save routing table information to stdout +.RS +This command behaves like +.BR "ip route show" +except that the output is raw data suitable for passing to +.BR "ip route restore" . +.RE + +.TP +ip route restore +restore routing table information from stdin +.RS +This command expects to read a data stream as returned from +.BR "ip route save" . +It will attempt to restore the routing table information exactly as +it was at the time of the save, so any translation of information +in the stream (such as device indexes) must be done first. Any existing +routes are left unchanged. Any routes specified in the data stream that +already exist in the table will be ignored. +.RE + +.SH NOTES +Starting with Linux kernel version 3.6, there is no routing cache for IPv4 +anymore. Hence +.B "ip route show cached" +will never print any entries on systems with this or newer kernel versions. 
+ +.SH EXAMPLES +.PP +ip ro +.RS 4 +Show all route entries in the kernel. +.RE +.PP +ip route add default via 192.168.1.1 dev eth0 +.RS 4 +Adds a default route (for all addresses) via the local gateway 192.168.1.1 that can +be reached on device eth0. +.RE +.PP +ip route add 10.1.1.0/30 encap mpls 200/300 via 10.1.1.1 dev eth0 +.RS 4 +Adds an ipv4 route with mpls encapsulation attributes attached to it. +.RE +.PP +ip -6 route add 2001:db8:1::/64 encap seg6 mode encap segs 2001:db8:42::1,2001:db8:ffff::2 dev eth0 +.RS 4 +Adds an IPv6 route with SRv6 encapsulation and two segments attached. +.RE +.PP +ip -6 route add 2001:db8:1::/64 encap seg6local action End.DT46 vrftable 100 dev vrf100 +.RS 4 +Adds an IPv6 route with SRv6 decapsulation and forward with lookup in VRF table. +.RE +.PP +ip -6 route add 2001:db8:1::/64 encap seg6local action End flavors psp dev eth0 +.RS 4 +Adds an IPv6 route with SRv6 End behavior with psp flavor enabled. +.RE +.PP +ip -6 route add 2001:db8:1::/64 encap seg6local action End flavors next-csid dev eth0 +.RS 4 +Adds an IPv6 route with SRv6 End behavior with next-csid flavor enabled. +.RE +.PP +ip -6 route add 2001:db8:1::/64 encap seg6local action End flavors next-csid lblen 48 nflen 16 dev eth0 +.RS 4 +Adds an IPv6 route with SRv6 End behavior with next-csid flavor enabled and user-provided Locator-Block and Locator-Node Function lengths. +.RE +.PP +ip -6 route add 2001:db8:1::/64 encap ioam6 freq 2/5 mode encap tundst 2001:db8:42::1 trace prealloc type 0x800000 ns 1 size 12 dev eth0 +.RS 4 +Adds an IPv6 route with an IOAM Pre-allocated Trace encapsulation (ip6ip6) that only includes the hop limit and the node id, configured for the IOAM namespace 1 and a pre-allocated data block of 12 octets (will be injected in 2 packets every 5 packets). +.RE +.PP +ip route add 10.1.1.0/30 nhid 10 +.RS 4 +Adds an ipv4 route using nexthop object with id 10. 
+.RE +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-rule.8 b/man/man8/ip-rule.8 new file mode 100644 index 0000000..d10b8b2 --- /dev/null +++ b/man/man8/ip-rule.8 @@ -0,0 +1,358 @@ +.TH IP\-RULE 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-rule \- routing policy database management +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B rule +.RI "{ " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.B ip rule +.RB "[ " show +.RI "[ " SELECTOR " ]]" + +.ti -8 +.B ip rule +.RB "{ " add " | " del " }" +.I SELECTOR ACTION + +.ti -8 +.B ip rule +.RB "{ " flush " | " save " | " restore " }" + +.ti -8 +.IR SELECTOR " := [ " +.BR not " ] [" +.B from +.IR PREFIX " ] [ " +.B to +.IR PREFIX " ] [ " +.B tos +.IR TOS " ] [ " +.B fwmark +.IR FWMARK\fR[\fB/\fIMASK "] ] [ " +.B iif +.IR STRING " ] [ " +.B oif +.IR STRING " ] [ " +.B priority +.IR PREFERENCE " ] [ " +.IR l3mdev " ] [ " +.B uidrange +.IR NUMBER "-" NUMBER " ] [ " +.B ipproto +.IR PROTOCOL " ] [ " +.BR sport " [ " +.IR NUMBER " | " +.IR NUMBER "-" NUMBER " ] ] [ " +.BR dport " [ " +.IR NUMBER " | " +.IR NUMBER "-" NUMBER " ] ] [ " +.B tun_id +.IR TUN_ID " ]" +.BR + + +.ti -8 +.IR ACTION " := [ " +.B table +.IR TABLE_ID " ] [ " +.B protocol +.IR PROTO " ] [ " +.B nat +.IR ADDRESS " ] [ " +.B realms +.RI "[" SRCREALM "\fB/\fR]" DSTREALM " ] [" +.B goto +.IR NUMBER " ] " SUPPRESSOR + +.ti -8 +.IR SUPPRESSOR " := [ " +.B suppress_prefixlength +.IR NUMBER " ] [ " +.B suppress_ifgroup +.IR GROUP " ]" + +.ti -8 +.IR TABLE_ID " := [ " +.BR local " | " main " | " default " |" +.IR NUMBER " ]" + +.SH DESCRIPTION +.I ip rule +manipulates rules +in the routing policy database that controls the route selection algorithm. + +.P +Classic routing algorithms used in the Internet make routing decisions +based only on the destination address of packets (and in theory, +but not in practice, on the TOS field). 
+ +.P +In some circumstances, we want to route packets differently depending not only +on destination addresses but also on other packet fields: source address, +IP protocol, transport protocol ports or even packet payload. +This task is called 'policy routing'. + +.P +To solve this task, the conventional destination based routing table, ordered +according to the longest match rule, is replaced with a 'routing policy +database' (or RPDB), which selects routes by executing some set of rules. + +.P +Each policy routing rule consists of a +.B selector +and an +.B action predicate. +The RPDB is scanned in order of decreasing priority (note that a lower number +means higher priority, see the description of +.I PREFERENCE +below). The selector +of each rule is applied to {source address, destination address, incoming +interface, tos, fwmark} and, if the selector matches the packet, +the action is performed. The action predicate may return with success. +In this case, it will either give a route or failure indication +and the RPDB lookup is terminated. Otherwise, the RPDB program +continues with the next rule. + +.P +Semantically, the natural action is to select the nexthop and the output device. + +.P +At startup time the kernel configures the default RPDB consisting of three +rules: + +.TP +1. +Priority: 0, Selector: match anything, Action: lookup routing +table +.B local +(ID 255). +The +.B local +table is a special routing table containing +high priority control routes for local and broadcast addresses. + +.TP +2. +Priority: 32766, Selector: match anything, Action: lookup routing +table +.B main +(ID 254). +The +.B main +table is the normal routing table containing all non-policy +routes. This rule may be deleted and/or overridden with other +ones by the administrator. + +.TP +3. +Priority: 32767, Selector: match anything, Action: lookup routing +table +.B default +(ID 253). +The +.B default +table is empty. 
It is reserved for some post-processing if no previous +default rules selected the packet. +This rule may also be deleted. + +.P +Each RPDB entry has additional +attributes. For example, each rule has a pointer to some routing +table. NAT and masquerading rules have an attribute to select a new IP +address to translate/masquerade. Besides that, rules have some +optional attributes, which routes have, namely +.BR "realms" . +These values do not override those contained in the routing tables. They +are only used if the route did not select any attributes. + +.sp +The RPDB may contain rules of the following types: + +.RS +.B unicast +- the rule returns the route found +in the routing table referenced by the rule. + +.B blackhole +- the rule causes the packet to be silently dropped. + +.B unreachable +- the rule generates a 'Network is unreachable' error. + +.B prohibit +- the rule generates a 'Communication is administratively +prohibited' error. + +.B nat +- the rule translates the source address +of the IP packet into some other value. +.RE + +.TP +.B ip rule add - insert a new rule +.TP +.B ip rule delete - delete a rule +.RS +.TP +.BI type " TYPE " (default) +the type of this rule. The list of valid types was given in the previous +subsection. + +.TP +.BI from " PREFIX" +select the source prefix to match. + +.TP +.BI to " PREFIX" +select the destination prefix to match. + +.TP +.BI iif " NAME" +select the incoming device to match. If the interface is loopback, +the rule only matches packets originating from this host. This means +that you may create separate routing tables for forwarded and local +packets and, hence, completely segregate them. + +.TP +.BI oif " NAME" +select the outgoing device to match. The outgoing interface is only +available for packets originating from local sockets that are bound to +a device. + +.TP +.BI tos " TOS" +.TP +.BI dsfield " TOS" +select the TOS value to match. + +.TP +.BI fwmark " MARK" +select the +.B fwmark +value to match. 
+ +.TP +.BI uidrange " NUMBER-NUMBER" +select the +.B uid +value to match. + +.TP +.BI ipproto " PROTOCOL" +select the ip protocol value to match. + +.TP +.BI sport " NUMBER | NUMBER-NUMBER" +select the source port value to match. Port ranges are supported. + +.TP +.BI dport " NUMBER | NUMBER-NUMBER" +select the destination port value to match. Port ranges are supported. + +.TP +.BI priority " PREFERENCE" +the priority of this rule. +.I PREFERENCE +is an unsigned integer value, higher number means lower priority, and rules get +processed in order of increasing number. Each rule +should have an explicitly set +.I unique +priority value. +The options preference and order are synonyms for priority. + +.TP +.BI table " TABLEID" +the routing table identifier to look up if the rule selector matches. +It is also possible to use lookup instead of table. + +.TP +.BI protocol " PROTO" +the routing protocol that installed the rule in question. For example, when zebra installs a rule, it gets RTPROT_ZEBRA as the installing protocol. + +.TP +.BI suppress_prefixlength " NUMBER" +reject routing decisions that have a prefix length of NUMBER or less. + +.TP +.BI suppress_ifgroup " GROUP" +reject routing decisions that use a device belonging to the interface +group GROUP. + +.TP +.BI realms " FROM/TO" +Realms to select if the rule matched and the routing table lookup +succeeded. Realm +.I TO +is only used if the route did not select any realm. + +.TP +.BI nat " ADDRESS" +The base of the IP address block to translate (for source addresses). +The +.I ADDRESS +may be either the start of the block of NAT addresses (selected by NAT +routes) or a local host address (or even zero). +In the last case the router does not translate the packets, but +masquerades them to this address. +Using map-to instead of nat means the same thing. + +.B Warning: +Changes to the RPDB made with these commands do not become active +immediately. 
It is assumed that after a script finishes a batch of +updates, it flushes the routing cache with +.BR "ip route flush cache" . +.RE +.TP +.B ip rule flush - also dumps all the deleted rules. +.RS +.TP +.BI protocol " PROTO" +Select the originating protocol. +.RE +.TP +.B ip rule show - list rules +This command has no arguments. +The options list or lst are synonyms with show. + +.TP +.B ip rule save +.RS +.TP +.BI protocol " PROTO" +Select the originating protocol. +.RE +.TP +save rules table information to stdout +.RS +This command behaves like +.BR "ip rule show" +except that the output is raw data suitable for passing to +.BR "ip rule restore" . +.RE + +.TP +.B ip rule restore +restore rules table information from stdin +.RS +This command expects to read a data stream as returned from +.BR "ip rule save" . +It will attempt to restore the rules table information exactly as +it was at the time of the save. Any rules already in the table are +left unchanged, and duplicates are not ignored. +.RE + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-sr.8 b/man/man8/ip-sr.8 new file mode 100644 index 0000000..6be1cc5 --- /dev/null +++ b/man/man8/ip-sr.8 @@ -0,0 +1,58 @@ +.TH IP\-SR 8 "14 Apr 2017" "iproute2" "Linux" +.SH "NAME" +ip-sr \- IPv6 Segment Routing management +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B ip sr +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 + +.ti -8 +.B ip sr hmac show + +.ti -8 +.B ip sr hmac set +.I KEYID ALGO + +.ti -8 +.B ip sr tunsrc show + +.ti -8 +.B ip sr tunsrc set +.I ADDRESS + +.SH DESCRIPTION +The \fBip sr\fR command is used to configure IPv6 Segment Routing (SRv6) +internal parameters. +.PP +Those parameters include the mapping between an HMAC key ID and its associated +hashing algorithm and secret, and the IPv6 address to use as source for encapsulated +packets. 
+.PP +The \fBip sr hmac set\fR command prompts for a passphrase that will be used as the +HMAC secret for the corresponding key ID. A blank passphrase removes the mapping. +The currently supported algorithms for \fIALGO\fR are \fBsha1\fR and \fBsha256\fR. +.PP +If the tunnel source is set to the address :: (which is the default), then an address +of the egress interface will be selected. As this operation may hinder performance, +it is recommended to set a non-default address. + +.SH EXAMPLES +.PP +.SS Configure an HMAC mapping for key ID 42 and hashing algorithm SHA-256 +.nf +# ip sr hmac set 42 sha256 +.PP +.SS Set the tunnel source address to 2001:db8::1 +.nf +# ip sr tunsrc set 2001:db8::1 +.SH SEE ALSO +.br +.BR ip-route (8) +.SH AUTHOR +David Lebrun <david.lebrun@uclouvain.be> diff --git a/man/man8/ip-stats.8 b/man/man8/ip-stats.8 new file mode 100644 index 0000000..2633645 --- /dev/null +++ b/man/man8/ip-stats.8 @@ -0,0 +1,208 @@ +.TH IP\-STATS 8 "16 Mar 2022" "iproute2" "Linux" +.SH NAME +ip-stats \- manage and show interface statistics +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B ip +.B stats +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.BR "ip stats show" +.RB "[ " dev +.IR DEV " ] " +.RB "[ " group +.IR GROUP " [ " +.BI subgroup " SUBGROUP" +.RB " [ " suite +.IR " SUITE" " ] ... ] ... ] ..." + +.ti -8 +.BR "ip stats set" +.BI dev " DEV" +.BR l3_stats " { " +.BR on " | " off " }" + +.SH DESCRIPTION + +.TP +.B ip stats set +is used for toggling whether a certain HW statistics suite is collected on +a given netdevice. The following statistics suites are supported: + +.in 21 + +.ti 14 +.B l3_stats +L3 stats reflect traffic that takes place in a HW device on an object that +corresponds to the given software netdevice. + +.TP +.B ip stats show +is used for showing stats on a given netdevice, or dumping statistics +across all netdevices. By default, all stats are requested. 
It is possible +to filter which stats are requested by using the +.B group +and +.B subgroup +keywords. + +It is possible to specify several groups, or several subgroups for one +group. When no subgroups are given for a group, all the subgroups are +requested. + +The following groups are recognized: +.in 21 + +.ti 14 +.B group link +- Link statistics. The same suite that "ip -s link show" shows. + +.ti 14 +.B group offload +- A group that contains a number of HW-oriented statistics. See below for +individual subgroups within this group. + +.ti 14 +.B group xstats +- Extended statistics. A subgroup identifies the type of netdevice to show the +statistics for. + +.ti 14 +.B group xstats_slave +- Extended statistics for the slave of a netdevice of a given type. A subgroup +identifies the type of master netdevice. + +.ti 14 +.B group afstats +- A group for address-family specific netdevice statistics. + +.TQ +.BR "group offload " subgroups: +.in 21 + +.ti 14 +.B subgroup cpu_hit +- The +.B cpu_hit +statistics suite is useful on hardware netdevices. The +.B link +statistics on these devices reflect both the hardware- and +software-datapath traffic. The +.B cpu_hit +statistics then only reflect software-datapath traffic. + +.ti 14 +.B subgroup hw_stats_info +- This suite does not include traffic statistics, but rather communicates +the state of other statistics. Through this subgroup, it is possible to +discover whether a given statistic was enabled, and when it was, whether +any device driver actually configured its device to collect these +statistics. 
For example, +.B l3_stats +was enabled in the following case, but no driver has installed it: + +# ip stats show dev swp1 group offload subgroup hw_stats_info +.br +56: swp1: group offload subgroup hw_stats_info +.br + l3_stats on used off + +After an L3 address is added to the netdevice, the counter will be +installed: + +# ip addr add dev swp1 192.0.2.1/28 +.br +# ip stats show dev swp1 group offload subgroup hw_stats_info +.br +56: swp1: group offload subgroup hw_stats_info +.br + l3_stats on used on + +.ti 14 +.B subgroup l3_stats +- These statistics reflect L3 traffic that takes place in HW on an object +that corresponds to the netdevice. Note that this suite is disabled by +default and needs to be first enabled through +.B ip stats set\fR. + +For example: + +# ip stats show dev swp2.200 group offload subgroup l3_stats +.br +112: swp2.200: group offload subgroup l3_stats on used on +.br + RX: bytes packets errors dropped mcast +.br + 8900 72 2 0 3 +.br + TX: bytes packets errors dropped +.br + 7176 58 0 0 + +Note how the l3_stats_info for the selected group is also part of the dump. + +.TQ +.BR "group xstats " and " group xstats_slave " subgroups: +.in 21 + +.ti 14 +.B subgroup bridge \fR[\fB suite stp \fR] [\fB suite mcast \fR] +- Statistics for STP and, respectively, IGMP / MLD (under the keyword +\fBmcast\fR) traffic on bridges and their slaves. + +.ti 14 +.B subgroup bond \fR[\fB suite 802.3ad \fR] +- Statistics for LACP traffic on bond devices and their slaves. + +.TQ +.BR "group afstats " subgroups: +.in 21 + +.ti 14 +.B subgroup mpls +- Statistics for MPLS traffic seen on the netdevice. For example: + +# ip stats show dev veth01 group afstats subgroup mpls +.br +3: veth01: group afstats subgroup mpls +.br + RX: bytes packets errors dropped noroute +.br + 0 0 0 0 0 +.br + TX: bytes packets errors dropped +.br + 216 2 0 0 + +.SH EXAMPLES +.PP +# ip stats set dev swp1 l3_stats on +.RS +Enables collection of L3 HW statistics on swp1. 
+.RE + +.PP +# ip stats show group offload +.RS +Shows all offload statistics on all netdevices. +.RE + +.PP +# ip stats show dev swp1 group link +.RS +Shows link statistics on the given netdevice. +.RE + +.SH SEE ALSO +.br +.BR ip (8), +.BR ip-link (8) + +.SH AUTHOR +Manpage by Petr Machata. diff --git a/man/man8/ip-tcp_metrics.8 b/man/man8/ip-tcp_metrics.8 new file mode 100644 index 0000000..5d2dac8 --- /dev/null +++ b/man/man8/ip-tcp_metrics.8 @@ -0,0 +1,143 @@ +.TH "IP\-TCP_METRICS" 8 "23 Aug 2012" "iproute2" "Linux" +.SH "NAME" +ip-tcp_metrics \- management for TCP Metrics +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B tcp_metrics +.RI "{ " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.BR "ip tcp_metrics" " { " show " | " flush " }" +.IR SELECTOR + +.ti -8 +.BR "ip tcp_metrics delete " [ " address " ] +.IR ADDRESS + +.ti -8 +.IR SELECTOR " := " +.RB "[ [ " address " ] " +.IR PREFIX " ]" + +.SH "DESCRIPTION" +.B ip tcp_metrics +is used to manipulate entries in the kernel that keep TCP information +for IPv4 and IPv6 destinations. The entries are created when +TCP sockets want to share information for destinations and are +stored in a cache keyed by the destination address. The saved +information may include values for metrics (initially obtained from +routes), recent TSVAL for TIME-WAIT recycling purposes, state for the +Fast Open feature, etc. +For performance reasons the cache cannot grow above the configured limit +and the older entries are replaced with fresh information, sometimes +reclaimed and used for new destinations. The kernel never removes +entries; they can be flushed only with this tool. + +.SS ip tcp_metrics show - show cached entries + +.TP +.BI address " PREFIX " (default) +IPv4/IPv6 prefix or address. If no prefix is provided all entries are shown. + +.LP +The output may contain the following information: + +.BI age " <S.MMM>" sec +- time since the entry was created, reset or updated with metrics +from sockets. 
The entry is reset and refreshed on use with metrics from the +route if the metrics were not updated in the last hour. Not all cached values +reset the age on update. + +.BI cwnd " <N>" +- CWND metric value + +.BI fo_cookie " <HEX-STRING>" +- Cookie value received in SYN-ACK to be used by Fast Open for next SYNs + +.BI fo_mss " <N>" +- MSS value received in SYN-ACK to be used by Fast Open for next SYNs + +.BI fo_syn_drops " <N>/<S.MMM>" "sec ago" +- Number of drops of initial outgoing Fast Open SYNs with data +detected by monitoring the received SYN-ACK after SYN retransmission. +The seconds show the time since the last SYN drop and together with +the drop count can be used to disable Fast Open for some time. + +.BI reordering " <N>" +- Reordering metric value + +.BI rtt " <N>" us +- RTT metric value + +.BI rttvar " <N>" us +- RTTVAR metric value + +.BI ssthresh " <SSTHRESH>" +- SSTHRESH metric value + +.BI tw_ts " <TSVAL>/<SEC>" "sec ago" +- recent TSVAL and the seconds after saving it into TIME-WAIT socket + +.SS ip tcp_metrics delete - delete single entry + +.TP +.BI address " ADDRESS " (default) +IPv4/IPv6 address. The address is a required argument. + +.SS ip tcp_metrics flush - flush entries +This command flushes the entries selected by some criteria. + +.PP +This command has the same arguments as +.BR show . + +.SH "EXAMPLES" +.PP +ip tcp_metrics show address 192.168.0.0/24 +.RS 4 +Shows the entries for destinations from subnet +.RE +.PP +ip tcp_metrics show 192.168.0.0/24 +.RS 4 +The same but address keyword is optional +.RE +.PP +ip tcp_metrics +.RS 4 +Show all is the default action +.RE +.PP +ip tcp_metrics delete 192.168.0.1 +.RS 4 +Removes the entry for 192.168.0.1 from cache. +.RE +.PP +ip tcp_metrics flush 192.168.0.0/24 +.RS 4 +Removes entries for destinations from subnet +.RE +.PP +ip tcp_metrics flush all +.RS 4 +Removes all entries from cache +.RE +.PP +ip -6 tcp_metrics flush all +.RS 4 +Removes all IPv6 entries from cache keeping the IPv4 entries. 
+.RE + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Julian Anastasov <ja@ssi.bg> diff --git a/man/man8/ip-token.8 b/man/man8/ip-token.8 new file mode 100644 index 0000000..6505b8c --- /dev/null +++ b/man/man8/ip-token.8 @@ -0,0 +1,75 @@ +.TH IP\-TOKEN 8 "28 Mar 2013" "iproute2" "Linux" +.SH "NAME" +ip-token \- tokenized interface identifier support +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip token +.RI "{ " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.B ip token set +.IR TOKEN +.B dev +.IR DEV + +.ti -8 +.B ip token del dev +.IR DEV + +.ti -8 +.B ip token get +.RB "[ " dev +.IR DEV " ]" + +.ti -8 +.BR "ip token" " [ " list " ]" + +.SH "DESCRIPTION" +IPv6 tokenized interface identifier support is used for assigning well-known +host-part addresses to nodes whilst still obtaining a global network prefix +from Router advertisements. The primary target for tokenized identifiers are +server platforms where addresses are usually manually configured, rather than +using DHCPv6 or SLAAC. By using tokenized identifiers, hosts can still +determine their network prefix by use of SLAAC, but more readily be +automatically renumbered should their network prefix change [1]. Tokenized +IPv6 Identifiers are described in the draft +[1]: <draft-chown-6man-tokenised-ipv6-identifiers-02>. + +.SS ip token set - set an interface token +set the interface token to the kernel. +.TP +.I TOKEN +the interface identifier token address. +.TP +.BI dev " DEV" +the networking interface. + +.SS ip token del - delete an interface token +delete the interface token from the kernel. +.TP +.BI dev " DEV" +the networking interface. + +.SS ip token get - get the interface token from the kernel +show a tokenized interface identifier of a particular networking device. +.B Arguments: +coincide with the arguments of +.B ip token set +but the +.I TOKEN +must be left out. 
+.SS ip token list - list all interface tokens +list all tokenized interface identifiers for the networking interfaces from +the kernel. + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Manpage by Daniel Borkmann diff --git a/man/man8/ip-tunnel.8 b/man/man8/ip-tunnel.8 new file mode 100644 index 0000000..57e030d --- /dev/null +++ b/man/man8/ip-tunnel.8 @@ -0,0 +1,281 @@ +.TH IP\-TUNNEL 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-tunnel - tunnel configuration +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip tunnel help +.sp +.ti -8 +.BR "ip " +.RI "[ " OPTIONS " ]" +.BR "tunnel" " { " add " | " change " | " del " | " show " | " prl " | " 6rd " }" +.RI "[ " NAME " ]" +.br +.RB "[ " mode +.IR MODE " ] [ " +.B remote +.IR ADDR " ] [ " +.B local +.IR ADDR " ]" +.br +.RB "[ [" i "|" o "]" seq " ] [ [" i "|" o "]" key +.IR KEY " ] [ " +.RB "[" i "|" o "]" csum " ] ]" +.br +.RB "[ " encaplimit +.IR ELIM " ]" +.RB "[ " ttl "|" hoplimit +.IR TTL " ]" +.br +.RB "[ " tos +.IR TOS " ] [ " +.B flowlabel +.IR FLOWLABEL " ]" +.br +.RB "[ " prl-default +.IR ADDR " ] [ " +.B prl-nodefault +.IR ADDR " ] [ " +.B prl-delete +.IR ADDR " ]" +.br +.RB "[ " 6rd-prefix +.IR ADDR " ] [" +.B 6rd-relay_prefix +.IR ADDR " ] [ +.BR 6rd-reset " ]" +.br +.RB "[ [" no "]" pmtudisc " ]" +.RB "[ [" no "]" ignore-df " ]" +.RB "[ [" no "]" allow-localremote " ]" +.br +.RB "[ " dev +.IR PHYS_DEV " ]" + +.ti -8 +.IR MODE " := " +.RB " { " ipip " | " gre " | " sit " | " isatap " | " vti " | " ip6ip6 " | " ipip6 " | " ip6gre " | " vti6 " | " any " }" + +.ti -8 +.IR ADDR " := { " IP_ADDRESS " |" +.BR any " }" + +.ti -8 +.IR TOS " := { " STRING " | " 00 ".." ff " |" +.BR inherit " |" +.BI "inherit/" STRING +.RB "|" +.BI "inherit/" 00 ".." ff +.RB "}" + +.ti -8 +.IR ELIM " := {" +.BR none " | " +.IR 0 ".." 255 " }" + +.ti -8 +.ti -8 +.IR TTL " := { " 1 ".." 
255 " | " +.BR inherit " }" + +.ti -8 +.IR KEY " := { " DOTTED_QUAD " | " NUMBER " }" + +.SH DESCRIPTION +.B tunnel +objects are tunnels, encapsulating packets in IP packets and then +sending them over the IP infrastructure. +The encapsulating (or outer) address family is specified by the +.B -f +option. The default is IPv4. + +.TP +.B ip tunnel add +add a new tunnel +.TP +.B ip tunnel change +change an existing tunnel +.TP +.B ip tunnel delete +destroy a tunnel +.RS +.TP +.BI name " NAME " (default) +select the tunnel device name. + +.TP +.BI mode " MODE" +set the tunnel mode. Available modes depend on the encapsulating address family. +.br +Modes for IPv4 encapsulation available: +.BR ipip ", " sit ", " isatap ", " vti ", and " gre "." +.br +Modes for IPv6 encapsulation available: +.BR ip6ip6 ", " ipip6 ", " ip6gre ", " vti6 ", and " any "." + +.TP +.BI remote " ADDRESS" +set the remote endpoint of the tunnel. + +.TP +.BI local " ADDRESS" +set the fixed local address for tunneled packets. +It must be an address on another interface of this host. + +.TP +.BI ttl " N" +.TP +.BI hoplimit " N" +set a fixed TTL (IPv4) or hoplimit (IPv6) +.I N +on tunneled packets. +.I N +is a number in the range 1--255. 0 is a special value +meaning that packets inherit the TTL value. +The default value for IPv4 tunnels is: +.BR "inherit" . +The default value for IPv6 tunnels is: +.BR "64" . + + +.TP +.BI tos " T" +.TP +.BI dsfield " T" +.TP +.BI tclass " T" +set the type of service (IPv4) or traffic class (IPv6) field on +tunneled packets, which can be specified as either a two-digit +hex value (e.g. c0) or a predefined string (e.g. internet). +The value +.B inherit +causes the field to be copied from the original IP header. The +values +.BI "inherit/" STRING +or +.BI "inherit/" 00 ".." ff +will set the field to +.I STRING +or +.IR 00 ".." ff +when tunneling non-IP packets. The default value is 00. 
+ +.TP +.BI dev " NAME" +bind the tunnel to the device +.I NAME +so that tunneled packets will only be routed via this device and will +not be able to escape to another device when the route to endpoint +changes. + +.TP +.B nopmtudisc +disable Path MTU Discovery on this tunnel. +It is enabled by default. Note that a fixed ttl is incompatible +with this option: tunneling with a fixed ttl always makes pmtu +discovery. + +.TP +.B ignore-df +enable IPv4 DF suppression on this tunnel. +Normally datagrams that exceed the MTU will be fragmented; the presence +of the DF flag inhibits this, resulting instead in an ICMP Unreachable +(Fragmentation Required) message. Enabling this attribute causes the +DF flag to be ignored. + +.TP +.BI key " K" +.TP +.BI ikey " K" +.TP +.BI okey " K" +.RB ( " only GRE tunnels " ) +use keyed GRE with key +.IR K ". " K +is either a number or an IP address-like dotted quad. +The +.B key +parameter sets the key to use in both directions. +The +.BR ikey " and " okey +parameters set different keys for input and output. + +.TP +.BR csum ", " icsum ", " ocsum +.RB ( " only GRE tunnels " ) +generate/require checksums for tunneled packets. +The +.B ocsum +flag calculates checksums for outgoing packets. +The +.B icsum +flag requires that all input packets have the correct +checksum. The +.B csum +flag is equivalent to the combination +.BR "icsum ocsum" . + +.TP +.BR seq ", " iseq ", " oseq +.RB ( " only GRE tunnels " ) +serialize packets. +The +.B oseq +flag enables sequencing of outgoing packets. +The +.B iseq +flag requires that all input packets are serialized. +The +.B seq +flag is equivalent to the combination +.BR "iseq oseq" . +.B It doesn't work. Don't use it. + +.TP +.BI encaplimit " ELIM" +.RB ( " only IPv6 tunnels " ) +set a fixed encapsulation limit. Default is 4. + +.TP +.BI flowlabel " FLOWLABEL" +.RB ( " only IPv6 tunnels " ) +set a fixed flowlabel. 
+ +.TP +.BI allow-localremote +.RB ( " only IPv6 tunnels " ) +allow remote endpoint on the local host. +.RE + +.TP +.B ip tunnel prl +potential router list (ISATAP only) +.RS +.TP +.BI dev " NAME" +mandatory device name. + +.TP +.BI prl-default " ADDR" +.TP +.BI prl-nodefault " ADDR" +.TP +.BI prl-delete " ADDR" +.RB "Add or delete " ADDR +as a potential router or default router. +.RE + +.TP +.B ip tunnel show +list tunnels +This command has no arguments. + +.SH SEE ALSO +.br +.BR ip (8) + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/ip-vrf.8 b/man/man8/ip-vrf.8 new file mode 100644 index 0000000..946e8f8 --- /dev/null +++ b/man/man8/ip-vrf.8 @@ -0,0 +1,139 @@ +.TH IP\-VRF 8 "7 Dec 2016" "iproute2" "Linux" +.SH NAME +ip-vrf \- run a command against a vrf +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B ip +.B vrf +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.BR "ip vrf show" +.RI "[ " NAME " ]" + +.ti -8 +.BR "ip vrf identify" +.RI "[ " PID " ]" + +.ti -8 +.BR "ip vrf pids" +.I NAME + +.ti -8 +.BR "ip vrf exec " +.RI "[ " NAME " ] " command ... + +.SH DESCRIPTION +A VRF provides traffic isolation at layer 3 for routing, similar to how a +VLAN is used to isolate traffic at layer 2. Fundamentally, a VRF is a separate +routing table. Network devices are associated with a VRF by enslaving the +device to the VRF. At that point network addresses assigned to the device are +local to the VRF with host and connected routes moved to the table associated +with the VRF. + +A process can specify a VRF using several APIs -- binding the socket to the +VRF device using SO_BINDTODEVICE, setting the VRF association using +IP_UNICAST_IF or IPV6_UNICAST_IF, or specifying the VRF for a specific message +using IP_PKTINFO or IPV6_PKTINFO. + +By default a process is not bound to any VRF. 
An association can be set +explicitly by making the program use one of the APIs mentioned above or +implicitly using a helper to set SO_BINDTODEVICE for all IPv4 and IPv6 +sockets (AF_INET and AF_INET6) when the socket is created. This ip-vrf command +is a helper to run a command against a specific VRF with the VRF association +inherited from parent to child. + +.TP +.B ip vrf show [ NAME ] - Show all configured VRF +.sp +This command lists all VRF and their corresponding table ids. If NAME is +given, then only that VRF and table id is shown. The latter command is +useful for scripting where the table id for a VRF is needed. + +.TP +.B ip vrf exec [ NAME ] cmd ... - Run cmd against the named VRF +.sp +This command allows applications that are VRF unaware to be run against +a VRF other than the default VRF (main table). A command can be run against +the default VRF by passing "default" as the VRF name. This is useful if +the current shell is associated with another VRF (e.g., Management VRF). + +This command requires the system to be booted with cgroup v2 (e.g. with systemd, +add systemd.unified_cgroup_hierarchy=1 to the kernel command line). + +This command must also be run as root. Alternatively it +can be run by an unprivileged user if the following +.BR capabilities (7) +are given: + +.RS +.IP \fBCAP_BPF\fP +To load the BPF program. +.IP \fBCAP_NET_ADMIN\fP +To set the socket into the cgroup. +.IP \fBCAP_DAC_OVERRIDE\fP +To create the cgroup subdir in /sys. +.RE + +.IP +If these capabilities are added and if +.BR ip (8) +is built with +.BR libcap (3) +then these capabilities will be dropped before +.BR cmd +is executed by +.B ip vrf exec. +For every other unprivileged invocation of +.BR ip (8) +all capabilities will be dropped. + +.br +.B NOTE: +capabilities will +.B NOT +be dropped if +.B CAP_NET_ADMIN +is set to +.B INHERITABLE +to avoid breaking programs with ambient capabilities that call ip.
+ +.TP +.B ip vrf identify [PID] - Report VRF association for process +.sp +This command shows the VRF association of the specified process. If PID is +not specified then the id of the current process is used. + +.TP +.B ip vrf pids NAME - Report processes associated with the named VRF +.sp +This command shows all process ids that are associated with the given +VRF. + +.SH CAVEATS +This command requires a kernel compiled with CGROUPS and CGROUP_BPF enabled. + +The VRF helper *only* affects network layer sockets. + +.SH EXAMPLES +.PP +ip vrf exec red ssh 10.100.1.254 +.RS +Executes ssh to 10.100.1.254 against the VRF red table. +.RE + +.SH SEE ALSO +.br +.BR ip (8), +.BR ip-link (8), +.BR ip-address (8), +.BR ip-route (8), +.BR ip-neighbor (8) + +.SH AUTHOR +Original Manpage by David Ahern diff --git a/man/man8/ip-xfrm.8 b/man/man8/ip-xfrm.8 new file mode 100644 index 0000000..6dc73d2 --- /dev/null +++ b/man/man8/ip-xfrm.8 @@ -0,0 +1,775 @@ +'\" t +.TH IP\-XFRM 8 "20 Dec 2011" "iproute2" "Linux" +.SH "NAME" +ip-xfrm \- transform configuration +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B xfrm +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.B "ip xfrm" +.IR XFRM-OBJECT " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR XFRM-OBJECT " :=" +.BR state " | " policy " | " monitor +.sp + +.ti -8 +.BR "ip xfrm state" " { " add " | " update " } " +.IR ID " [ " ALGO-LIST " ]" +.RB "[ " mode +.IR MODE " ]" +.RB "[ " mark +.I MARK +.RB "[ " mask +.IR MASK " ] ]" +.RB "[ " reqid +.IR REQID " ]" +.RB "[ " seq +.IR SEQ " ]" +.RB "[ " replay-window +.IR SIZE " ]" +.RB "[ " replay-seq +.IR SEQ " ]" +.RB "[ " replay-oseq +.IR SEQ " ]" +.RB "[ " replay-seq-hi +.IR SEQ " ]" +.RB "[ " replay-oseq-hi +.IR SEQ " ]" +.RB "[ " flag +.IR FLAG-LIST " ]" +.RB "[ " sel +.IR SELECTOR " ] [ " LIMIT-LIST " ]" +.RB "[ " encap +.IR ENCAP " ]" +.RB "[ " coa +.IR ADDR "[/" PLEN "] ]" +.RB "[ " ctx +.IR CTX " ]" +.RB "[ " extra-flag +.IR EXTRA-FLAG-LIST " ]" 
+.RB "[ " output-mark +.IR OUTPUT-MARK +.RB "[ " mask +.IR MASK " ] ]" +.RB "[ " if_id +.IR IF-ID " ]" +.RB "[ " offload +.RB "[ " crypto | packet " ]" +.RB dev +.IR DEV " +.RB dir +.IR DIR " ]" +.RB "[ " tfcpad +.IR LENGTH " ]" + +.ti -8 +.B "ip xfrm state allocspi" +.I ID +.RB "[ " mode +.IR MODE " ]" +.RB "[ " mark +.I MARK +.RB "[ " mask +.IR MASK " ] ]" +.RB "[ " reqid +.IR REQID " ]" +.RB "[ " seq +.IR SEQ " ]" +.RB "[ " min +.I SPI +.B max +.IR SPI " ]" + +.ti -8 +.BR "ip xfrm state" " { " delete " | " get " } " +.I ID +.RB "[ " mark +.I MARK +.RB "[ " mask +.IR MASK " ] ]" + +.ti -8 +.BR ip " [ " -4 " | " -6 " ] " "xfrm state deleteall" " [" +.IR ID " ]" +.RB "[ " mode +.IR MODE " ]" +.RB "[ " reqid +.IR REQID " ]" +.RB "[ " flag +.IR FLAG-LIST " ]" + +.ti -8 +.BR ip " [ " -4 " | " -6 " ] " "xfrm state list" " [" +.IR ID " ]" +.RB "[ " nokeys " ]" +.RB "[ " mode +.IR MODE " ]" +.RB "[ " reqid +.IR REQID " ]" +.RB "[ " flag +.IR FLAG-LIST " ]" + +.ti -8 +.BR "ip xfrm state flush" " [ " proto +.IR XFRM-PROTO " ]" + +.ti -8 +.BR "ip xfrm state count" + +.ti -8 +.IR ID " :=" +.RB "[ " src +.IR ADDR " ]" +.RB "[ " dst +.IR ADDR " ]" +.RB "[ " proto +.IR XFRM-PROTO " ]" +.RB "[ " spi +.IR SPI " ]" + +.ti -8 +.IR XFRM-PROTO " :=" +.BR esp " | " ah " | " comp " | " route2 " | " hao + +.ti -8 +.IR ALGO-LIST " := [ " ALGO-LIST " ] " ALGO + +.ti -8 +.IR ALGO " :=" +.RB "{ " enc " | " auth " } " +.IR ALGO-NAME " " ALGO-KEYMAT " |" +.br +.B auth-trunc +.IR ALGO-NAME " " ALGO-KEYMAT " " ALGO-TRUNC-LEN " |" +.br +.B aead +.IR ALGO-NAME " " ALGO-KEYMAT " " ALGO-ICV-LEN " |" +.br +.B comp +.IR ALGO-NAME + +.ti -8 +.IR MODE " := " +.BR transport " | " tunnel " | " beet " | " ro " | " in_trigger + +.ti -8 +.IR FLAG-LIST " := [ " FLAG-LIST " ] " FLAG + +.ti -8 +.IR FLAG " :=" +.BR noecn " | " decap-dscp " | " nopmtudisc " | " wildrecv " | " icmp " | " +.BR af-unspec " | " align4 " | " esn + +.ti -8 +.IR SELECTOR " :=" +.RB "[ " src +.IR ADDR "[/" PLEN "] ]" +.RB "[ " dst +.IR 
ADDR "[/" PLEN "] ]" +.RB "[ " dev +.IR DEV " ]" +.br +.RI "[ " UPSPEC " ]" + +.ti -8 +.IR UPSPEC " := " +.BR proto " {" +.IR PROTO " |" +.br +.RB "{ " tcp " | " udp " | " sctp " | " dccp " } [ " sport +.IR PORT " ]" +.RB "[ " dport +.IR PORT " ] |" +.br +.RB "{ " icmp " | " ipv6-icmp " | " mobility-header " } [ " type +.IR NUMBER " ]" +.RB "[ " code +.IR NUMBER " ] |" +.br +.BR gre " [ " key +.RI "{ " DOTTED-QUAD " | " NUMBER " } ] }" + +.ti -8 +.IR LIMIT-LIST " := [ " LIMIT-LIST " ]" +.B limit +.I LIMIT + +.ti -8 +.IR LIMIT " :=" +.RB "{ " time-soft " | " time-hard " | " time-use-soft " | " time-use-hard " }" +.IR "SECONDS" " |" +.br +.RB "{ " byte-soft " | " byte-hard " }" +.IR SIZE " |" +.br +.RB "{ " packet-soft " | " packet-hard " }" +.I COUNT + +.ti -8 +.IR ENCAP " :=" +.RB "{ " espinudp " | " espinudp-nonike " | " espintcp " }" +.IR SPORT " " DPORT " " OADDR + +.ti -8 +.IR EXTRA-FLAG-LIST " := [ " EXTRA-FLAG-LIST " ] " EXTRA-FLAG + +.ti -8 +.IR EXTRA-FLAG " := " +.BR dont-encap-dscp " | " oseq-may-wrap + +.ti -8 +.BR "ip xfrm policy" " { " add " | " update " }" +.I SELECTOR +.B dir +.I DIR +.RB "[ " ctx +.IR CTX " ]" +.RB "[ " mark +.I MARK +.RB "[ " mask +.IR MASK " ] ]" +.RB "[ " index +.IR INDEX " ]" +.RB "[ " ptype +.IR PTYPE " ]" +.RB "[ " action +.IR ACTION " ]" +.RB "[ " priority +.IR PRIORITY " ]" +.RB "[ " flag +.IR FLAG-LIST " ]" +.RB "[ " if_id +.IR IF-ID " ]" +.RB "[ " offload +.RB packet +.RB dev +.IR DEV " ]" +.RI "[ " LIMIT-LIST " ] [ " TMPL-LIST " ]" + +.ti -8 +.BR "ip xfrm policy" " { " delete " | " get " }" +.RI "{ " SELECTOR " | " +.B index +.IR INDEX " }" +.B dir +.I DIR +.RB "[ " ctx +.IR CTX " ]" +.RB "[ " mark +.I MARK +.RB "[ " mask +.IR MASK " ] ]" +.RB "[ " ptype +.IR PTYPE " ]" +.RB "[ " if_id +.IR IF-ID " ]" + +.ti -8 +.BR ip " [ " -4 " | " -6 " ] " "xfrm policy" " { " deleteall " | " list " }" +.RB "[ " nosock " ]" +.RI "[ " SELECTOR " ]" +.RB "[ " dir +.IR DIR " ]" +.RB "[ " index +.IR INDEX " ]" +.RB "[ " ptype +.IR PTYPE " ]" 
+.RB "[ " action +.IR ACTION " ]" +.RB "[ " priority +.IR PRIORITY " ]" +.RB "[ " flag +.IR FLAG-LIST "]" + +.ti -8 +.B "ip xfrm policy flush" +.RB "[ " ptype +.IR PTYPE " ]" + +.ti -8 +.B "ip xfrm policy count" + +.ti -8 +.B "ip xfrm policy set" +.RB "[ " hthresh4 +.IR LBITS " " RBITS " ]" +.RB "[ " hthresh6 +.IR LBITS " " RBITS " ]" + +.ti -8 +.B "ip xfrm policy setdefault" +.IR DIR +.IR ACTION " [ " +.IR DIR +.IR ACTION " ] [ " +.IR DIR +.IR ACTION " ]" + +.ti -8 +.B "ip xfrm policy getdefault" + +.ti -8 +.IR SELECTOR " :=" +.RB "[ " src +.IR ADDR "[/" PLEN "] ]" +.RB "[ " dst +.IR ADDR "[/" PLEN "] ]" +.RB "[ " dev +.IR DEV " ]" +.RI "[ " UPSPEC " ]" + +.ti -8 +.IR UPSPEC " := " +.BR proto " {" +.IR PROTO " |" +.br +.RB "{ " tcp " | " udp " | " sctp " | " dccp " } [ " sport +.IR PORT " ]" +.RB "[ " dport +.IR PORT " ] |" +.br +.RB "{ " icmp " | " ipv6-icmp " | " mobility-header " } [ " type +.IR NUMBER " ]" +.RB "[ " code +.IR NUMBER " ] |" +.br +.BR gre " [ " key +.RI "{ " DOTTED-QUAD " | " NUMBER " } ] }" + +.ti -8 +.IR DIR " := " +.BR in " | " out " | " fwd + +.ti -8 +.IR PTYPE " := " +.BR main " | " sub + +.ti -8 +.IR ACTION " := " +.BR allow " | " block + +.ti -8 +.IR FLAG-LIST " := [ " FLAG-LIST " ] " FLAG + +.ti -8 +.IR FLAG " :=" +.BR localok " | " icmp + +.ti -8 +.IR LIMIT-LIST " := [ " LIMIT-LIST " ]" +.B limit +.I LIMIT + +.ti -8 +.IR LIMIT " :=" +.RB "{ " time-soft " | " time-hard " | " time-use-soft " | " time-use-hard " }" +.IR "SECONDS" " |" +.br +.RB "{ " byte-soft " | " byte-hard " }" +.IR SIZE " |" +.br +.RB "{ " packet-soft " | " packet-hard " }" +.I COUNT + +.ti -8 +.IR TMPL-LIST " := [ " TMPL-LIST " ]" +.B tmpl +.I TMPL + +.ti -8 +.IR TMPL " := " ID +.RB "[ " mode +.IR MODE " ]" +.RB "[ " reqid +.IR REQID " ]" +.RB "[ " level +.IR LEVEL " ]" + +.ti -8 +.IR ID " :=" +.RB "[ " src +.IR ADDR " ]" +.RB "[ " dst +.IR ADDR " ]" +.RB "[ " proto +.IR XFRM-PROTO " ]" +.RB "[ " spi +.IR SPI " ]" + +.ti -8 +.IR XFRM-PROTO " :=" +.BR esp " | " ah " | " 
comp " | " route2 " | " hao + +.ti -8 +.IR MODE " := " +.BR transport " | " tunnel " | " beet " | " ro " | " in_trigger + +.ti -8 +.IR LEVEL " :=" +.BR required " | " use + +.ti -8 +.BR "ip xfrm monitor" " [" +.BI all-nsid +] [ +.BI nokeys +] [ +.BI all + | +.IR LISTofXFRM-OBJECTS " ]" + +.ti -8 +.IR LISTofXFRM-OBJECTS " := [ " LISTofXFRM-OBJECTS " ] " XFRM-OBJECT + +.ti -8 +.IR XFRM-OBJECT " := " +.BR acquire " | " expire " | " SA " | " policy " | " aevent " | " report + +.in -8 +.ad b + +.SH DESCRIPTION + +xfrm is an IP framework for transforming packets (such as encrypting +their payloads). This framework is used to implement the IPsec protocol +suite (with the +.B state +object operating on the Security Association Database, and the +.B policy +object operating on the Security Policy Database). It is also used for +the IP Payload Compression Protocol and features of Mobile IPv6. + +.TS +l l. +ip xfrm state add add new state into xfrm +ip xfrm state update update existing state in xfrm +ip xfrm state allocspi allocate an SPI value +ip xfrm state delete delete existing state in xfrm +ip xfrm state get get existing state in xfrm +ip xfrm state deleteall delete all existing state in xfrm +ip xfrm state list print out the list of existing state in xfrm +ip xfrm state flush flush all state in xfrm +ip xfrm state count count all existing state in xfrm +.TE + +.TP +.IR ID +is specified by a source address, destination address, +.RI "transform protocol " XFRM-PROTO "," +and/or Security Parameter Index +.IR SPI "." +(For IP Payload Compression, the Compression Parameter Index or CPI is used for +.IR SPI ".)" + +.TP +.I XFRM-PROTO +specifies a transform protocol: +.RB "IPsec Encapsulating Security Payload (" esp ")," +.RB "IPsec Authentication Header (" ah ")," +.RB "IP Payload Compression (" comp ")," +.RB "Mobile IPv6 Type 2 Routing Header (" route2 "), or" +.RB "Mobile IPv6 Home Address Option (" hao ")." + +.TP +.I ALGO-LIST +contains one or more algorithms to use. 
Each algorithm +.I ALGO +is specified by: +.RS +.IP \[bu] +the algorithm type: +.RB "encryption (" enc ")," +.RB "authentication (" auth " or " auth-trunc ")," +.RB "authenticated encryption with associated data (" aead "), or" +.RB "compression (" comp ")" +.IP \[bu] +the algorithm name +.IR ALGO-NAME +(see below) +.IP \[bu] +.RB "(for all except " comp ")" +the keying material +.IR ALGO-KEYMAT "," +which may include both a key and a salt or nonce value; refer to the +corresponding RFC +.IP \[bu] +.RB "(for " auth-trunc " only)" +the truncation length +.I ALGO-TRUNC-LEN +in bits +.IP \[bu] +.RB "(for " aead " only)" +the Integrity Check Value length +.I ALGO-ICV-LEN +in bits +.RE + +.nh +.RS +Encryption algorithms include +.BR ecb(cipher_null) ", " cbc(des) ", " cbc(des3_ede) ", " cbc(cast5) "," +.BR cbc(blowfish) ", " cbc(aes) ", " cbc(serpent) ", " cbc(camellia) "," +.BR cbc(twofish) ", and " rfc3686(ctr(aes)) "." + +Authentication algorithms include +.BR digest_null ", " hmac(md5) ", " hmac(sha1) ", " hmac(sha256) "," +.BR hmac(sha384) ", " hmac(sha512) ", " hmac(rmd160) ", and " xcbc(aes) "." + +Authenticated encryption with associated data (AEAD) algorithms include +.BR rfc4106(gcm(aes)) ", " rfc4309(ccm(aes)) ", and " rfc4543(gcm(aes)) "." + +Compression algorithms include +.BR deflate ", " lzs ", and " lzjh "." +.RE +.hy + +.TP +.I MODE +specifies a mode of operation for the transform protocol. IPsec and IP Payload +Compression modes are +.BR transport ", " tunnel "," +and (for IPsec ESP only) Bound End-to-End Tunnel +.RB "(" beet ")." +Mobile IPv6 modes are route optimization +.RB "(" ro ")" +and inbound trigger +.RB "(" in_trigger ")." + +.TP +.I FLAG-LIST +contains one or more of the following optional flags: +.BR noecn ", " decap-dscp ", " nopmtudisc ", " wildrecv ", " icmp ", " +.BR af-unspec ", " align4 ", or " esn "." 
+ +.TP +.IR SELECTOR +selects the traffic that will be controlled by the policy, based on the source +address, the destination address, the network device, and/or +.IR UPSPEC "." + +.TP +.IR UPSPEC +selects traffic by protocol. For the +.BR tcp ", " udp ", " sctp ", or " dccp +protocols, the source and destination port can optionally be specified. +For the +.BR icmp ", " ipv6-icmp ", or " mobility-header +protocols, the type and code numbers can optionally be specified. +For the +.B gre +protocol, the key can optionally be specified as a dotted-quad or number. +Other protocols can be selected by name or number +.IR PROTO "." + +.TP +.I LIMIT-LIST +sets limits in seconds, bytes, or numbers of packets. + +.TP +.I ENCAP +encapsulates packets with protocol +.BR espinudp ", " espinudp-nonike ", or " espintcp "," +.RI "using source port " SPORT ", destination port " DPORT +.RI ", and original address " OADDR "." + +.TP +.I MARK +used to match xfrm policies and states + +.TP +.I OUTPUT-MARK +used to set the output mark to influence the routing +of the packets emitted by the state + +.TP +.I IF-ID +xfrm interface identifier used in both xfrm policies and states + +.TP +.I DEV +Network interface name used to offload policies and states + +.sp +.PP +.TS +l l. +ip xfrm policy add add a new policy +ip xfrm policy update update an existing policy +ip xfrm policy delete delete an existing policy +ip xfrm policy get get an existing policy +ip xfrm policy deleteall delete all existing xfrm policies +ip xfrm policy list print out the list of xfrm policies +ip xfrm policy flush flush policies +.TE + +.TP +.BR nosock +filter (remove) all socket policies from the output. + +.TP +.IR SELECTOR +selects the traffic that will be controlled by the policy, based on the source +address, the destination address, the network device, and/or +.IR UPSPEC "." + +.TP +.IR UPSPEC +selects traffic by protocol.
For the +.BR tcp ", " udp ", " sctp ", or " dccp +protocols, the source and destination port can optionally be specified. +For the +.BR icmp ", " ipv6-icmp ", or " mobility-header +protocols, the type and code numbers can optionally be specified. +For the +.B gre +protocol, the key can optionally be specified as a dotted-quad or number. +Other protocols can be selected by name or number +.IR PROTO "." + +.TP +.I DIR +selects the policy direction as +.BR in ", " out ", or " fwd "." + +.TP +.I CTX +sets the security context. + +.TP +.I PTYPE +can be +.BR main " (default) or " sub "." + +.TP +.I ACTION +can be +.BR allow " (default) or " block "." + +.TP +.I PRIORITY +is a number that defaults to zero. + +.TP +.I FLAG-LIST +contains one or both of the following optional flags: +.BR localok " or " icmp "." + +.TP +.I LIMIT-LIST +sets limits in seconds, bytes, or numbers of packets. + +.TP +.I TMPL-LIST +is a template list specified using +.IR ID ", " MODE ", " REQID ", and/or " LEVEL ". " + +.TP +.IR ID +is specified by a source address, destination address, +.RI "transform protocol " XFRM-PROTO "," +and/or Security Parameter Index +.IR SPI "." +(For IP Payload Compression, the Compression Parameter Index or CPI is used for +.IR SPI ".)" + +.TP +.I XFRM-PROTO +specifies a transform protocol: +.RB "IPsec Encapsulating Security Payload (" esp ")," +.RB "IPsec Authentication Header (" ah ")," +.RB "IP Payload Compression (" comp ")," +.RB "Mobile IPv6 Type 2 Routing Header (" route2 "), or" +.RB "Mobile IPv6 Home Address Option (" hao ")." + +.TP +.I MODE +specifies a mode of operation for the transform protocol. IPsec and IP Payload +Compression modes are +.BR transport ", " tunnel "," +and (for IPsec ESP only) Bound End-to-End Tunnel +.RB "(" beet ")." +Mobile IPv6 modes are route optimization +.RB "(" ro ")" +and inbound trigger +.RB "(" in_trigger ")." + +.TP +.I LEVEL +can be +.BR required " (default) or " use "." + +.sp +.PP +.TS +l l.
+ip xfrm policy count count existing policies +.TE + +.PP +Use one or more -s options to display more details, including policy hash table +information. + +.sp +.PP +.TS +l l. +ip xfrm policy set configure the policy hash table +.TE + +.PP +Security policies whose address prefix lengths are greater than or equal to the +policy hash table thresholds are hashed. Others are stored in the +policy_inexact chained list. + +.TP +.I LBITS +specifies the minimum local address prefix length of policies that are +stored in the Security Policy Database hash table. + +.TP +.I RBITS +specifies the minimum remote address prefix length of policies that are +stored in the Security Policy Database hash table. + +.sp +.PP +.TS +l l. +ip xfrm monitor state monitoring for xfrm objects +.TE + +.PP +The xfrm objects to monitor can be optionally specified. + +.P +If the +.BI all-nsid +option is set, the program listens to all network namespaces that have a +nsid assigned into the network namespace where the program is running. +A prefix is displayed to show the network namespace where the message +originates.
Example: +.sp +.in +2 +[nsid 1]Flushed state proto 0 +.in -2 +.sp + +.SH AUTHOR +Manpage revised by David Ward <david.ward@ll.mit.edu> +.br +Manpage revised by Christophe Gouault <christophe.gouault@6wind.com> +.br +Manpage revised by Nicolas Dichtel <nicolas.dichtel@6wind.com> diff --git a/man/man8/ip.8 b/man/man8/ip.8 new file mode 100644 index 0000000..fdae57c --- /dev/null +++ b/man/man8/ip.8 @@ -0,0 +1,476 @@ +.TH IP 8 "20 Dec 2011" "iproute2" "Linux" +.SH NAME +ip \- show / manipulate routing, network devices, interfaces and tunnels +.SH SYNOPSIS + +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.B ip +.RB "[ " -force " ] " +.BI "-batch " filename +.sp + +.ti -8 +.IR OBJECT " := { " +.BR address " | " addrlabel " | " fou " | " help " | " ila " | " ioam " | "\ + l2tp " | " link " | " macsec " | " maddress " | " monitor " | " mptcp " | "\ + mroute " | " mrule " | " neighbor " | " neighbour " | " netconf " | "\ + netns " | " nexthop " | " ntable " | " ntbl " | " route " | " rule " | "\ + sr " | " tap " | " tcpmetrics " | " token " | " tunnel " | " tuntap " | "\ + vrf " | " xfrm " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-h\fR[\fIuman-readable\fR] | +\fB\-s\fR[\fItatistics\fR] | +\fB\-d\fR[\fIetails\fR] | +\fB\-r\fR[\fIesolve\fR] | +\fB\-iec\fR | +\fB\-f\fR[\fIamily\fR] { +.BR inet " | " inet6 " | " link " } | " +\fB-4\fR | +\fB-6\fR | +\fB-B\fR | +\fB-0\fR | +\fB-l\fR[\fIoops\fR] { \fBmaximum-addr-flush-attempts\fR } | +\fB\-o\fR[\fIneline\fR] | +\fB\-rc\fR[\fIvbuf\fR] [\fBsize\fR] | +\fB\-t\fR[\fIimestamp\fR] | +\fB\-ts\fR[\fIhort\fR] | +\fB\-n\fR[\fIetns\fR] name | +\fB\-N\fR[\fIumeric\fR] | +\fB\-a\fR[\fIll\fR] | +\fB\-c\fR[\fIolor\fR] | +\fB\-br\fR[\fIief\fR] | +\fB\-j\fR[\fIson\fR] | +\fB\-p\fR[\fIretty\fR] } + +.SH OPTIONS + +.TP +.BR "\-V" , " -Version" +Print the version of the +.B ip +utility and exit.
+ +.TP +.BR "\-h", " \-human", " \-human-readable" +output statistics with human readable values followed by suffix. + +.TP +.BR "\-b", " \-batch " <FILENAME> +Read commands from provided file or standard input and invoke them. +First failure will cause termination of ip. + +.TP +.BR "\-force" +Don't terminate ip on errors in batch mode. If there were any errors +during execution of the commands, the application return code will be +non zero. + +.TP +.BR "\-s" , " \-stats" , " \-statistics" +Output more information. If the option +appears twice or more, the amount of information increases. +As a rule, the information is statistics or some time values. + +.TP +.BR "\-d" , " \-details" +Output more detailed information. + +.TP +.BR "\-l" , " \-loops " <COUNT> +Specify maximum number of loops the 'ip address flush' logic +will attempt before giving up. The default is 10. +Zero (0) means loop until all addresses are removed. + +.TP +.BR "\-f" , " \-family " <FAMILY> +Specifies the protocol family to use. The protocol family identifier +can be one of +.BR "inet" , " inet6" , " bridge" , " mpls" +or +.BR link . +If this option is not present, +the protocol family is guessed from other arguments. If the rest +of the command line does not give enough information to guess the +family, +.B ip +falls back to the default one, usually +.B inet +or +.BR "any" . +.B link +is a special family identifier meaning that no networking protocol +is involved. + +.TP +.B \-4 +shortcut for +.BR "-family inet" . + +.TP +.B \-6 +shortcut for +.BR "\-family inet6" . + +.TP +.B \-B +shortcut for +.BR "\-family bridge" . + +.TP +.B \-M +shortcut for +.BR "\-family mpls" . + +.TP +.B \-0 +shortcut for +.BR "\-family link" . + +.TP +.BR "\-o" , " \-oneline" +output each record on a single line, replacing line feeds +with the +.B '\e' +character. This is convenient when you want to count records +with +.BR wc (1) +or to +.BR grep (1) +the output. 
+ +.TP +.BR "\-r" , " \-resolve" +use the system's name resolver to print DNS names instead of +host addresses. + +.TP +.BR "\-n" , " \-netns " <NETNS> +switches +.B ip +to the specified network namespace +.IR NETNS . +Actually it just simplifies executing of: + +.B ip netns exec +.IR NETNS +.B ip +.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" + +to + +.B ip +.RI "-n[etns] " NETNS " [ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" + +.TP +.BR "\-N" , " \-Numeric" +Print the number of protocol, scope, dsfield, etc directly instead of +converting it to human readable name. + +.TP +.BR "\-a" , " \-all" +executes specified command over all objects, it depends if command +supports this option. + +.TP +.BR \-c [ color ][ = { always | auto | never } +Configure color output. If parameter is omitted or +.BR always , +color output is enabled regardless of stdout state. If parameter is +.BR auto , +stdout is checked to be a terminal before enabling color output. If +parameter is +.BR never , +color output is disabled. If specified multiple times, the last one takes +precedence. This flag is ignored if +.B \-json +is also given. + +Used color palette can be influenced by +.BR COLORFGBG +environment variable +(see +.BR ENVIRONMENT ). + +.TP +.BR "\-t" , " \-timestamp" +display current time when using monitor option. + +.TP +.BR "\-ts" , " \-tshort" +Like +.BR \-timestamp , +but use shorter format. + +.TP +.BR "\-rc" , " \-rcvbuf" <SIZE> +Set the netlink socket receive buffer size, defaults to 1MB. + +.TP +.BR "\-iec" +print human readable rates in IEC units (e.g. 1Ki = 1024). + +.TP +.BR "\-br" , " \-brief" +Print only basic information in a tabular format for better +readability. This option is currently only supported by +.BR "ip addr show ", " ip link show " & " ip neigh show " commands. + +.TP +.BR "\-j", " \-json" +Output results in JavaScript Object Notation (JSON). 
+ +.TP +.BR "\-p", " \-pretty" +The default JSON format is compact and more efficient to parse but +hard for most users to read. This flag adds indentation for +readability. + +.TP +.BR "\-echo" +Request the kernel to send the applied configuration back. + +.SH IP - COMMAND SYNTAX + +.SS +.I OBJECT + +.TP +.B address +- protocol (IP or IPv6) address on a device. + +.TP +.B addrlabel +- label configuration for protocol address selection. + +.TP +.B fou +- Foo-over-UDP receive port configuration. + +.TP +.B ila +- manage identifier locator addresses (ILA). + +.TP +.B ioam +- manage IOAM namespaces and IOAM schemas. + +.TP +.B l2tp +- tunnel ethernet over IP (L2TPv3). + +.TP +.B link +- network device. + +.TP +.B macsec +- MACsec device configuration. + +.TP +.B maddress +- multicast address. + +.TP +.B monitor +- watch for netlink messages. + +.TP +.B mptcp +- manage MPTCP path manager. + +.TP +.B mroute +- multicast routing cache entry. + +.TP +.B mrule +- rule in multicast routing policy database. + +.TP +.B neighbour +- manage ARP or NDISC cache entries. + +.TP +.B netconf +- network configuration monitoring. + +.TP +.B netns +- manage network namespaces. + +.TP +.B nexthop +- manage nexthop objects. + +.TP +.B ntable +- manage the neighbor cache's operation. + +.TP +.B route +- routing table entry. + +.TP +.B rule +- rule in routing policy database. + +.TP +.B sr +- manage IPv6 segment routing. + +.TP +.B stats +- manage and show interface statistics. + +.TP +.B tcp_metrics/tcpmetrics +- manage TCP Metrics. + +.TP +.B token +- manage tokenized interface identifiers. + +.TP +.B tunnel +- tunnel over IP. + +.TP +.B tuntap +- manage TUN/TAP devices. + +.TP +.B vrf +- manage virtual routing and forwarding devices. + +.TP +.B xfrm +- manage IPSec policies. + +.PP +The names of all objects may be written in full or +abbreviated form, for example +.B address +can be abbreviated as +.B addr +or just +.B a. 
+ +.SS +.I COMMAND + +Specifies the action to perform on the object. +The set of possible actions depends on the object type. +As a rule, it is possible to +.BR "add" , " delete" +and +.B show +(or +.B list +) objects, but some objects do not allow all of these operations +or have some additional commands. The +.B help +command is available for all objects. It prints +out a list of available commands and argument syntax conventions. +.sp +If no command is given, some default command is assumed. +Usually it is +.B list +or, if the objects of this class cannot be listed, +.BR "help" . + +.SH ENVIRONMENT +.TP +.B COLORFGBG +If set, its value is used to detect whether the background is dark or +light and to choose contrasting colors for it. + +COLORFGBG environment variable usually contains either two or three +values separated by semicolons; we want the last value in either case. +If this value is 0-6 or 8, choose colors suitable for a dark background: + +COLORFGBG=";0" ip -c a + +.SH EXIT STATUS +Exit status is 0 if command was successful, and 1 if there is a syntax error. +If an error was reported by the kernel exit status is 2. + +.SH "EXAMPLES" +.PP +ip addr +.RS 4 +Shows addresses assigned to all network interfaces. +.RE +.PP +ip neigh +.RS 4 +Shows the current neighbour table in kernel. +.RE +.PP +ip link set x up +.RS 4 +Bring up interface x. +.RE +.PP +ip link set x down +.RS 4 +Bring down interface x. +.RE +.PP +ip route +.RS 4 +Show table routes. +.RE + +.SH HISTORY +.B ip +was written by Alexey N. Kuznetsov and added in Linux 2.2.
+.SH SEE ALSO +.BR ip-address (8), +.BR ip-addrlabel (8), +.BR ip-fou (8), +.BR ip-ioam (8), +.BR ip-l2tp (8), +.BR ip-link (8), +.BR ip-macsec (8), +.BR ip-maddress (8), +.BR ip-monitor (8), +.BR ip-mptcp (8), +.BR ip-mroute (8), +.BR ip-neighbour (8), +.BR ip-netconf (8), +.BR ip-netns (8), +.BR ip-nexthop (8), +.BR ip-ntable (8), +.BR ip-route (8), +.BR ip-rule (8), +.BR ip-sr (8), +.BR ip-stats (8), +.BR ip-tcp_metrics (8), +.BR ip-token (8), +.BR ip-tunnel (8), +.BR ip-vrf (8), +.BR ip-xfrm (8) +.br +.RB "IP Command reference " ip-cref.ps +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Original Manpage by Michail Litvak <mci@owl.openwall.com> diff --git a/man/man8/lnstat.8 b/man/man8/lnstat.8 new file mode 100644 index 0000000..b98241b --- /dev/null +++ b/man/man8/lnstat.8 @@ -0,0 +1,262 @@ +.TH LNSTAT 8 +.SH NAME +lnstat \- unified linux network statistics +.SH SYNOPSIS +.B lnstat +.RI [ options ] +.SH DESCRIPTION +This manual page documents briefly the +.B lnstat +command. +.PP +\fBlnstat\fP is a generalized and more feature-complete replacement for the old +rtstat program. It is commonly used to periodically print a selection of +statistical values exported by the kernel. +In addition to routing cache statistics, it supports any kind of statistics the +linux kernel exports via a file in /proc/net/stat/. +.PP +Each file in /proc/net/stat/ contains a header line listing the column names. +These names are used by \fBlnstat\fP as keys for selecting which statistics to +print. For every CPU present in the system, a line follows which lists the +actual values for each column of the file. \fBlnstat\fP sums these values up +(which in fact are counters) before printing them. After each interval, only +the difference to the last value is printed. 
+.PP +Files and columns may be selected by using the \fB-f\fP and \fB-k\fP +parameters. By default, all columns of all files are printed. +.SH OPTIONS +lnstat supports the following options. +.TP +.B \-h, \-\-help +Show summary of options. +.TP +.B \-V, \-\-version +Show version of program. +.TP +.B \-c, \-\-count <count> +Print <count> number of intervals. +.TP +.B \-d, \-\-dump +Dump list of available files/keys. +.TP +.B \-f, \-\-file <file> +Statistics file to use, may be specified multiple times. By default all files in /proc/net/stat are scanned. +.TP +.B \-i, \-\-interval <intv> +Set interval to 'intv' seconds. +.TP +.B \-j, \-\-json +Display results in JSON format. +.TP +.B \-k, \-\-keys k,k,k,... +Display only keys specified. Each key \fBk\fP is of the form \fB[file:]key\fP. If \fB<file>\fP +is given, the search for the given key is limited to that file. Otherwise the first file containing +the searched key is used. +.TP +.B \-s, \-\-subject [0-2] +Specify display of subject/header. '0' means no header at all, '1' prints a header only at start of the program and '2' prints a header every 20 lines. +.TP +.B \-w, \-\-width n,n,n,... +Width for each field. +.SH USAGE EXAMPLES +.TP +.B # lnstat -d +Get a list of supported statistics files. +.TP +.B # lnstat -k arp_cache:entries,rt_cache:in_hit,arp_cache:destroys +Select the specified files and keys. +.TP +.B # lnstat -i 10 +Use an interval of 10 seconds. +.TP +.B # lnstat -f ip_conntrack +Use only the specified file for statistics. +.TP +.B # lnstat -s 0 +Do not print a header at all. +.TP +.B # lnstat -s 2 +Print a header at start and every 20 lines. +.TP +.B # lnstat -c -1 -i 1 -f rt_cache -k entries,in_hit,in_slow_tot +Display statistics for keys entries, in_hit and in_slow_tot of file rt_cache every second. + +.SH FILES +.TP +.B /proc/net/stat/arp_cache, /proc/net/stat/ndisc_cache +Statistics around neighbor cache and ARP. \fBarp_cache\fP is for IPv4, \fBndisc_cache\fP is the same for IPv6. 
+.sp +.B entries +Number of entries in the neighbor table. +.sp +.B allocs +How many neighbor entries have been allocated. +.sp +.B destroys +How many neighbor entries have been removed. +.sp +.B hash_grows +How often the neighbor (hash) table was increased. +.sp +.B lookups +How many lookups were performed. +.sp +.B hits +How many \fBlookups\fP were successful. +.sp +.B res_failed +How many neighbor lookups failed. +.sp +.B rcv_probes_mcast +How many multicast neighbor solicitations were received. (IPv6 only.) +.sp +.B rcv_probes_ucast +How many unicast neighbor solicitations were received. (IPv6 only.) +.sp +.B periodic_gc_runs +How many garbage collection runs were executed. +.sp +.B forced_gc_runs +How many forced garbage collection runs were executed. Happens when adding an +entry and the table is too full. +.sp +.B unresolved_discards +How many neighbor table entries were discarded due to lookup failure. +.sp +.B table_fulls +Number of table overflows. Happens if table is full and forced GC run (see +\fBforced_gc_runs\fP) has failed. + +.TP +.B /proc/net/stat/ip_conntrack, /proc/net/stat/nf_conntrack +Conntrack related counters. \fBip_conntrack\fP is for backwards compatibility +with older userspace only and shows the same data as \fBnf_conntrack\fP. +.sp +.B entries +Number of entries in conntrack table. +.sp +.B searched +Number of conntrack table lookups performed. +.sp +.B found +Number of \fBsearched\fP entries which were successful. +.sp +.B new +Number of conntrack entries added which were not expected before. +.sp +.B invalid +Number of packets seen which can not be tracked. +.sp +.B ignore +Number of packets seen which are already connected to a conntrack entry. +.sp +.B delete +Number of conntrack entries which were removed. +.sp +.B delete_list +Number of conntrack entries which were put to dying list. +.sp +.B insert +Number of entries inserted into the list. 
+.sp +.B insert_failed +Number of entries for which list insertion was attempted but failed (happens if +the same entry is already present). +.sp +.B drop +Number of packets dropped due to conntrack failure. Either new conntrack entry +allocation failed, or protocol helper dropped the packet. +.sp +.B early_drop +Number of dropped conntrack entries to make room for new ones, if maximum table +size was reached. +.sp +.B icmp_error +Number of packets which could not be tracked due to error situation. This is a +subset of \fBinvalid\fP. +.sp +.B expect_new +Number of conntrack entries added after an expectation for them was already +present. +.sp +.B expect_create +Number of expectations added. +.sp +.B expect_delete +Number of expectations deleted. +.sp +.B search_restart +Number of conntrack table lookups which had to be restarted due to hashtable +resizes. + +.TP +.B /proc/net/stat/rt_cache +Routing cache statistics. +.sp +.B entries +Number of entries in routing cache. +.sp +.B in_hit +Number of route cache hits for incoming packets. Deprecated since IP route +cache removal, therefore always zero. +.sp +.B in_slow_tot +Number of routing cache entries added for input traffic. +.sp +.B in_slow_mc +Number of multicast routing cache entries added for input traffic. +.sp +.B in_no_route +Number of input packets for which no routing table entry was found. +.sp +.B in_brd +Number of matched input broadcast packets. +.sp +.B in_martian_dst +Number of incoming martian destination packets. +.sp +.B in_martian_src +Number of incoming martian source packets. +.sp +.B out_hit +Number of route cache hits for outgoing packets. Deprecated since IP route +cache removal, therefore always zero. +.sp +.B out_slow_tot +Number of routing cache entries added for output traffic. +.sp +.B out_slow_mc +Number of multicast routing cache entries added for output traffic. +.sp +.B gc_total +Total number of garbage collection runs. 
Deprecated since IP route cache +removal, therefore always zero. +.sp +.B gc_ignored +Number of ignored garbage collection runs due to minimum GC interval not +reached and routing cache not full. Deprecated since IP route cache removal, +therefore always zero. +.sp +.B gc_goal_miss +Number of garbage collector goal misses. Deprecated since IP route cache +removal, therefore always zero. +.sp +.B gc_dst_overflow +Number of destination cache overflows. Deprecated since IP route cache removal, +therefore always zero. +.sp +.B in_hlist_search +Number of hash table list traversals for input traffic. Deprecated since IP +route cache removal, therefore always zero. +.sp +.B out_hlist_search +Number of hash table list traversals for output traffic. Deprecated since IP +route cache removal, therefore always zero. + +.SH SEE ALSO +.BR ip (8) +.br +.SH AUTHOR +lnstat was written by Harald Welte <laforge@gnumonks.org>. +.PP +This manual page was written by Michael Prokop <mika@grml.org> for the Debian project (but may be used by others). 
diff --git a/man/man8/nstat.8 b/man/man8/nstat.8 new file mode 100644 index 0000000..c703cc8 --- /dev/null +++ b/man/man8/nstat.8 @@ -0,0 +1 @@ +.so man8/rtacct.8 diff --git a/man/man8/rdma-dev.8 b/man/man8/rdma-dev.8 new file mode 100644 index 0000000..368cdc7 --- /dev/null +++ b/man/man8/rdma-dev.8 @@ -0,0 +1,98 @@ +.TH RDMA\-DEV 8 "06 Jul 2017" "iproute2" "Linux" +.SH NAME +rdma-dev \- RDMA device configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B rdma +.RI "[ " OPTIONS " ]" +.B dev +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-d\fR[\fIetails\fR] } + +.ti -8 +.B rdma dev show +.RI "[ " DEV " ]" + +.ti -8 +.B rdma dev set +.RI "[ " DEV " ]" +.BR name +.BR NEWNAME + +.ti -8 +.B rdma dev set +.RI "[ " DEV " ]" +.BR netns +.BR NSNAME + +.ti -8 +.B rdma dev set +.RI "[ " DEV " ]" +.BR adaptive-moderation +.BR [on/off] + +.ti -8 +.B rdma dev help + +.SH "DESCRIPTION" +.SS rdma dev set - rename RDMA device or set network namespace or set RDMA device adaptive-moderation + +.SS rdma dev show - display RDMA device attributes + +.PP +.I "DEV" +- specifies the RDMA device to show. +If this argument is omitted all devices are listed. + +.SH "EXAMPLES" +.PP +rdma dev +.RS 4 +Shows the state of all RDMA devices on the system. +.RE +.PP +rdma dev show mlx5_3 +.RS 4 +Shows the state of specified RDMA device. +.RE +.PP +rdma dev set mlx5_3 name rdma_0 +.RS 4 +Renames the mlx5_3 device to rdma_0. +.RE +.PP +rdma dev set mlx5_3 netns foo +.RS 4 +Changes the network namespace of RDMA device to foo where foo is +previously created using iproute2 ip command. +.RE +.PP +rdma dev set mlx5_3 adaptive-moderation [on/off] +.RS 4 +Sets the state of adaptive interrupt moderation for the RDMA device. +.RE +.RS 4 +This is a global setting for the RDMA device but the value is printed for each CQ individually because the state is constant from CQ allocation. 
+.RE +.PP + +.SH SEE ALSO +.BR ip (8), +.BR rdma (8), +.BR rdma-link (8), +.BR rdma-resource (8), +.BR rdma-system (8), +.BR rdma-statistic (8), +.br + +.SH AUTHOR +Leon Romanovsky <leonro@mellanox.com> diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8 new file mode 100644 index 0000000..32f8022 --- /dev/null +++ b/man/man8/rdma-link.8 @@ -0,0 +1,104 @@ +.TH RDMA\-LINK 8 "06 Jul 2017" "iproute2" "Linux" +.SH NAME +rdma-link \- rdma link configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B rdma +.RI "[ " OPTIONS " ]" +.B link +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-d\fR[\fIetails\fR] } + +.ti -8 +.B rdma link show +.RI "[ " DEV/PORT_INDEX " ]" + +.ti -8 +.B rdma link add +.BR NAME +.BR type +.BR TYPE +.BR netdev +.BR NETDEV + +.ti -8 +.B rdma link delete +.RI NAME + +.ti -8 +.B rdma link help + +.SH "DESCRIPTION" +.SS rdma link show - display rdma link attributes + +.PP +.I "DEV/PORT_INDEX" +- specifies the RDMA link to show. +If this argument is omitted all links are listed. + +.SS rdma link add NAME type TYPE netdev NETDEV - add an rdma link for the specified type to the network device +.sp +.BR NAME +- specifies the new name of the rdma link to add + +.BR TYPE +- specifies which rdma type to use. Link types: +.sp +.in +8 +.B rxe +- Soft RoCE driver +.sp +.B siw +- Soft iWARP driver +.in -8 + +.BR NETDEV +- specifies the network device to which the link is bound + +.SS rdma link delete NAME - delete an rdma link +.PP +.BR NAME +- specifies the name of the rdma link to delete +.PP + +.SH "EXAMPLES" +.PP +rdma link show +.RS 4 +Shows the state of all rdma links on the system. +.RE +.PP +rdma link show mlx5_2/1 +.RS 4 +Shows the state of specified rdma link. 
+.RE +.PP +rdma link add rxe_eth0 type rxe netdev eth0 +.RS 4 +Adds a RXE link named rxe_eth0 to network device eth0 +.RE +.PP +rdma link del rxe_eth0 +.RS 4 +Removes RXE link rxe_eth0 +.RE +.PP + +.SH SEE ALSO +.BR rdma (8), +.BR rdma-dev (8), +.BR rdma-resource (8), +.BR rdma-statistic (8), +.br + +.SH AUTHOR +Leon Romanovsky <leonro@mellanox.com> diff --git a/man/man8/rdma-resource.8 b/man/man8/rdma-resource.8 new file mode 100644 index 0000000..1035478 --- /dev/null +++ b/man/man8/rdma-resource.8 @@ -0,0 +1,125 @@ +.TH RDMA\-RESOURCE 8 "26 Dec 2017" "iproute2" "Linux" +.SH NAME +rdma-resource \- rdma resource configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B rdma +.RI "[ " OPTIONS " ] " RESOURCE " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR RESOURCE " := { " +.BR cm_id " | " cq " | " mr " | " pd " | " qp " | " ctx " | " srq " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-j\fR[\fIson\fR] | +\fB\-d\fR[\fIetails\fR] } + +.ti -8 +.B rdma resource show +.RI "[ " DEV/PORT_INDEX " ]" + +.ti -8 +.B rdma resource help + +.SH "DESCRIPTION" +.SS rdma resource show - display rdma resource tracking information + +.PP +.I "DEV/PORT_INDEX" +- specifies the RDMA link to show. +If this argument is omitted all links are listed. + +.SH "EXAMPLES" +.PP +rdma resource show +.RS 4 +Shows summary for all devices on the system. +.RE +.PP +rdma resource show mlx5_2 +.RS 4 +Shows the state of specified rdma device. +.RE +.PP +rdma res show qp link mlx5_4 +.RS 4 +Get all QPs for the specific device. +.RE +.PP +rdma res show qp link mlx5_4/1 +.RS 4 +Get QPs of specific port. +.RE +.PP +rdma res show qp link mlx5_4/0 +.RS 4 +Provide illegal port number (0 is illegal). +.RE +.PP +rdma res show qp link mlx5_4/- +.RS 4 +Get QPs which have not assigned port yet. +.RE +.PP +rdma res show qp link mlx5_4/- -d +.RS 4 +Detailed view. +.RE +.PP +rdma res show qp link mlx5_4/- -dd +.RS 4 +Detailed view including driver-specific details. 
+.RE +.PP +rdma res show qp link mlx5_4/1 lqpn 0-6 +.RS 4 +Limit to specific Local QPNs. +.RE +.PP +rdma res show qp link mlx5_4/1 lqpn 6 -r +.RS 4 +Driver specific details in raw format. +.RE +.PP +rdma resource show cm_id dst-port 7174 +.RS 4 +Show CM_IDs with destination IP port of 7174. +.RE +.PP +rdma resource show cm_id src-addr 172.16.0.100 +.RS 4 +Show CM_IDs bound to local IP address 172.16.0.100 +.RE +.PP +rdma resource show cq pid 30489 +.RS 4 +Show CQs belonging to pid 30489 +.RE +.PP +rdma resource show ctx ctxn 1 +.RS 4 +Show contexts that have index equal to 1. +.RE +.PP +rdma resource show srq lqpn 5-7 +.RS 4 +Show SRQs that the QPs with lqpn 5-7 are associated with. +.RE +.PP + +.SH SEE ALSO +.BR rdma (8), +.BR rdma-dev (8), +.BR rdma-link (8), +.BR rdma-statistic (8), +.br + +.SH AUTHOR +Leon Romanovsky <leonro@mellanox.com> diff --git a/man/man8/rdma-statistic.8 b/man/man8/rdma-statistic.8 new file mode 100644 index 0000000..7dd2b02 --- /dev/null +++ b/man/man8/rdma-statistic.8 @@ -0,0 +1,255 @@ +.TH RDMA\-STATISTIC 8 "27 June 2019" "iproute2" "Linux" +.SH NAME +rdma-statistic \- RDMA statistic counter configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B rdma +.RI "[ " OPTIONS " ]" +.B statistic +.RI "{ " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.B rdma statistic +.RI "{ " OBJECT " }" +.B show + +.ti -8 +.B rdma statistic +.RI "[ " OBJECT " ]" +.B show link +.RI "[ " DEV/PORT_INDEX " ]" +.RI "[ " FILTER_NAME " " FILTER_VALUE " ]" + +.ti -8 +.B rdma statistic +.IR OBJECT +.B mode + +.ti -8 +.B rdma statistic +.IR OBJECT +.B set +.IR COUNTER_SCOPE +.RI "[ " DEV/PORT_INDEX " ]" +.B auto +.RI "{ " CRITERIA " | " +.BR off " }" + +.ti -8 +.B rdma statistic +.IR OBJECT +.B bind +.IR COUNTER_SCOPE +.RI "[ " DEV/PORT_INDEX " ]" +.RI "[ " OBJECT-ID " ]" +.RI "[ " COUNTER-ID " ]" + +.ti -8 +.B rdma statistic +.IR OBJECT +.B unbind +.IR COUNTER_SCOPE +.RI "[ " DEV/PORT_INDEX " ]" +.RI "[ " COUNTER-ID " ]" +.RI "[ " OBJECT-ID " ]" + +.ti -8 +.B rdma 
statistic +.B mode +.B "[" supported "]" +.B link +.RI "[ " DEV/PORT_INDEX " ]" + +.ti -8 +.B rdma statistic +.B set +.B link +.RI "[ " DEV/PORT_INDEX " ]" +.B optional-counters +.RI "[ " OPTIONAL-COUNTERS " ]" + +.ti -8 +.B rdma statistic +.B unset +.B link +.RI "[ " DEV/PORT_INDEX " ]" +.B optional-counters + +.ti -8 +.IR COUNTER_SCOPE " := " +.RB "{ " link " | " dev " }" + +.ti -8 +.IR OBJECT " := " +.RB "{ " qp " | " mr " }" + +.ti -8 +.IR CRITERIA " := " +.RB "{ " type " | " pid " }" + +.ti -8 +.IR FILTER_NAME " := " +.RB "{ " cntn " | " lqpn " | " pid " | " qp-type " }" + +.SH "DESCRIPTION" +.SS rdma statistic [object] show - Queries the specified RDMA device for RDMA and driver-specific statistics. Show the default hw counters if object is not specified + +.PP +.I "DEV" +- specifies counters on this RDMA device to show. + +.I "PORT_INDEX" +- specifies counters on this RDMA port to show. + +.I "FILTER_NAME" +- specifies a filter to show only the results matching it. + +.SS rdma statistic <object> set - configure counter statistic auto-mode for a specific device/port +In auto mode all objects belonging to one category are bound automatically to a single counter set. The "off" is global for all auto modes together. Not applicable for MR's. + +.SS rdma statistic <object> bind - manually bind an object (e.g., a qp) with a counter +When bound the statistics of this object are available in this counter. Not applicable for MR's. + +.SS rdma statistic <object> unbind - manually unbind an object (e.g., a qp) from the counter previously bound +When unbound the statistics of this object are no longer available in this counter; if the object id is not specified then all objects on this counter will be unbound. Not applicable for MR's. + +.I "COUNTER-ID" +- specifies the id of the counter to be bound. +If this argument is omitted then a new counter will be allocated. + +.SS rdma statistic mode - Display the enabled optional counters for each link. 
+ +.SS rdma statistic mode supported - Display the supported optional counters for each link. + +.SS rdma statistic set - Enable a set of optional counters for a specific device/port. + +.I "OPTIONAL-COUNTERS" +- specifies the name of the optional counters to enable. Optional counters that are not specified will be disabled. Note that optional counters are driver-specific. + +.SS rdma statistic unset - Disable all optional counters for a specific device/port. + +.SH "EXAMPLES" +.PP +rdma statistic show +.RS 4 +Shows the state of the default counter of all RDMA devices on the system. +.RE +.PP +rdma statistic show link mlx5_2/1 +.RS 4 +Shows the state of the default counter of specified RDMA port +.RE +.PP +rdma statistic qp show +.RS 4 +Shows the state of all qp counters of all RDMA devices on the system. +.RE +.PP +rdma statistic qp show link mlx5_2/1 +.RS 4 +Shows the state of all qp counters of specified RDMA port. +.RE +.PP +rdma statistic qp show link mlx5_2 pid 30489 +.RS 4 +Shows the state of all qp counters of specified RDMA port and belonging to pid 30489 +.RE +.PP +rdma statistic qp show link mlx5_2 qp-type UD +.RS 4 +Shows the state of all qp counters of specified RDMA port and with QP type UD +.RE +.PP +rdma statistic qp mode +.RS 4 +List current counter mode on all devices +.RE +.PP +rdma statistic qp mode link mlx5_2/1 +.RS 4 +List current counter mode of device mlx5_2 port 1 +.RE +.PP +rdma statistic qp set link mlx5_2/1 auto type on +.RS 4 +On device mlx5_2 port 1, for each new user QP bind it with a counter automatically. Per counter for QPs with same qp type. +.RE +.PP +rdma statistic qp set link mlx5_2/1 auto pid on +.RS 4 +On device mlx5_2 port 1, for each new user QP bind it with a counter automatically. Per counter for QPs with same pid. +.RE +.PP +rdma statistic qp set link mlx5_2/1 auto pid,type on +.RS 4 +On device mlx5_2 port 1, for each new user QP bind it with a counter automatically. Per counter for QPs with same pid and same type. 
+.RE +.PP +rdma statistic qp set link mlx5_2/1 auto off +.RS 4 +Turn off auto mode on device mlx5_2 port 1. The allocated counters can be manually accessed. +.RE +.PP +rdma statistic qp bind link mlx5_2/1 lqpn 178 +.RS 4 +On device mlx5_2 port 1, allocate a counter and bind the specified qp on it +.RE +.PP +rdma statistic qp unbind link mlx5_2/1 cntn 4 lqpn 178 +.RS 4 +On device mlx5_2 port 1, unbind the specified qp from the specified counter +.RE +.PP +rdma statistic qp unbind link mlx5_2/1 cntn 4 +.RS 4 +On device mlx5_2 port 1, unbind all QPs on the specified counter. After that this counter will be released automatically by the kernel. +.RE +.PP +rdma statistic show mr +.RS 4 +List all currently allocated MR's and their counters. +.RE +.PP +rdma statistic show mr mrn 6 +.RS 4 +Dump the statistics of the specific MR with mrn 6. Dumps nothing if it does not exist. +.RE +.PP +rdma statistic mode link mlx5_2/1 +.RS 4 +Display the optional counters that were enabled on mlx5_2/1. +.RE +.PP +rdma statistic mode supported link mlx5_2/1 +.RS 4 +Display the optional counters that mlx5_2/1 supports. +.RE +.PP +rdma statistic set link mlx5_2/1 optional-counters cc_rx_ce_pkts,cc_rx_cnp_pkts +.RS 4 +Enable the cc_rx_ce_pkts,cc_rx_cnp_pkts counters on device mlx5_2 port 1. +.RE +.PP +rdma statistic unset link mlx5_2/1 optional-counters +.RS 4 +Disable all the optional counters on device mlx5_2 port 1. 
+.RE + +.SH SEE ALSO +.BR rdma (8), +.BR rdma-dev (8), +.BR rdma-link (8), +.BR rdma-resource (8), +.br + +.SH AUTHORS +Mark Zhang <markz@mellanox.com> +.br +Erez Alfasi <ereza@mellanox.com> +.br +Neta Ostrovsky <netao@nvidia.com> diff --git a/man/man8/rdma-system.8 b/man/man8/rdma-system.8 new file mode 100644 index 0000000..554938e --- /dev/null +++ b/man/man8/rdma-system.8 @@ -0,0 +1,108 @@ +.TH RDMA\-SYSTEM 8 "06 Jul 2017" "iproute2" "Linux" +.SH NAME +rdma-system \- RDMA subsystem configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B rdma +.RI "[ " OPTIONS " ]" +.B sys +.RI " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-d\fR[\fIetails\fR] } + +.ti -8 +.B rdma system show + +.ti -8 +.B rdma system set +.BR netns +.BR NEWMODE + +.ti -8 +.B rdma system set +.BR privileged-qkey +.BR NEWSTATE + +.ti -8 +.B rdma system help + +.SH "DESCRIPTION" +.SS rdma system set - set RDMA subsystem network namespace mode or +privileged qkey mode + +.SS rdma system show - display RDMA subsystem network namespace mode and +privileged qkey state + +.PP +.I "NEWMODE" +- specifies the RDMA subsystem mode. Either exclusive or shared. +When user wants to assign dedicated RDMA device to a particular +network namespace, exclusive mode should be set before creating +any network namespace. If there are active network namespaces and if +one or more RDMA devices exist, changing mode from shared to +exclusive returns error code EBUSY. + +When RDMA subsystem is in shared mode, RDMA device is accessible in +all network namespace. When RDMA device isolation among multiple +network namespaces is not needed, shared mode can be used. + +It is preferred to not change the subsystem mode when there is active +RDMA traffic running, even though it is supported. +.PP +.I "NEWSTATE" +- Specifies the new state of the privileged-qkey parameter, either on or off. 
+This parameter determines whether a non-privileged user is allowed to specify a +controlled QKEY or not. + +.SH "EXAMPLES" +.PP +rdma system show +.RS 4 +Shows the state of RDMA subsystem network namespace mode on the system and +the state of privileged qkey parameter. +.RE +.PP +rdma system set netns exclusive +.RS 4 +Sets the RDMA subsystem in network namespace exclusive mode. In this mode RDMA devices +are visible only in single network namespace. +.RE +.PP +rdma system set netns shared +.RS 4 +Sets the RDMA subsystem in network namespace shared mode. In this mode RDMA devices +are shared among network namespaces. +.RE +.PP +.PP +rdma system set privileged-qkey on +.RS 4 +Sets the privileged-qkey parameter to on. In this state non-privileged user +is allowed to specify a controlled QKEY. +.RE +.PP +rdma system set privileged-qkey off +.RS 4 +Sets the privileged-qkey parameter to off. In this state non-privileged user +is *not* allowed to specify a controlled QKEY. +.RE +.PP + +.SH SEE ALSO +.BR rdma (8), +.BR rdma-link (8), +.BR rdma-resource (8), +.BR network_namespaces (7), +.BR namespaces (7), +.br + +.SH AUTHOR +Parav Pandit <parav@mellanox.com> diff --git a/man/man8/rdma.8 b/man/man8/rdma.8 new file mode 100644 index 0000000..5088b9e --- /dev/null +++ b/man/man8/rdma.8 @@ -0,0 +1,145 @@ +.TH RDMA 8 "28 Mar 2017" "iproute2" "Linux" +.SH NAME +rdma \- RDMA tool +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B rdma +.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" +.sp + +.ti -8 +.B rdma +.RB "[ " -force " ] " +.BI "-batch " filename +.sp + +.ti -8 +.IR OBJECT " := { " +.BR dev " | " link " | " resource " | " system " | " statistic " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] | +\fB\-d\fR[\fIetails\fR] | +\fB\-j\fR[\fIson\fR] | +\fB\-o\fR[\fIneline\fR] | +\fB\-p\fR[\fIretty\fR] } + +.SH OPTIONS + +.TP +.BR "\-V" , " -Version" +Print the version of the +.B rdma +tool and exit. 
+ +.TP +.BR "\-b", " \-batch " <FILENAME> +Read commands from provided file or standard input and invoke them. +First failure will cause termination of rdma. + +.TP +.BR "\-force" +Don't terminate rdma on errors in batch mode. +If there were any errors during execution of the commands, the application return code will be non zero. + +.TP +.BR "\-d" , " --details" +Output detailed information. Adding a second \-d includes driver-specific details. + +.TP +.BR "\-r" , " --raw" +Output includes driver-specific details in raw format. + +.TP +.BR "\-p" , " --pretty" +When combined with -j generate a pretty JSON output. + +.TP +.BR "\-j" , " --json" +Generate JSON output. + +.TP +.BR "\-o" , " \-oneline" +output each record on a single line, replacing line feeds +with the +.B '\e' +character. + +.SS +.I OBJECT + +.TP +.B dev +- RDMA device. + +.TP +.B link +- RDMA port related. + +.TP +.B resource +- RDMA resource configuration. + +.TP +.B sys +- RDMA subsystem related. + +.TP +.B statistic +- RDMA counter statistic related. + +.PP +The names of all objects may be written in full or +abbreviated form, for example +.B stats +can be abbreviated as +.B stat +or just +.B s. + +.SS +.I COMMAND + +Specifies the action to perform on the object. +The set of possible actions depends on the object type. +As a rule, it is possible to +.B show +(or +.B list +) objects, but some objects do not allow all of these operations +or have some additional commands. The +.B help +command is available for all objects. It prints +out a list of available commands and argument syntax conventions. +.sp +If no command is given, some default command is assumed. +Usually it is +.B list +or, if the objects of this class cannot be listed, +.BR "help" . + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. 
+ +.SH SEE ALSO +.BR rdma-dev (8), +.BR rdma-link (8), +.BR rdma-resource (8), +.BR rdma-system (8), +.BR rdma-statistic (8), +.br + +.SH REPORTING BUGS +Report any bugs to the Linux RDMA mailing list +.B <linux-rdma@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Leon Romanovsky <leonro@mellanox.com> diff --git a/man/man8/routel.8 b/man/man8/routel.8 new file mode 100644 index 0000000..b1668e7 --- /dev/null +++ b/man/man8/routel.8 @@ -0,0 +1,33 @@ +.TH ROUTEL 8 "1 Sept, 2021" "iproute2" "Linux" +.SH "NAME" +routel \- list routes with pretty output format +.SH SYNOPSIS +.B routel +.RI "[ " OPTIONS " ]" +.RI "[ " tablenr +[ \fIip route options...\fR ] ] +.P +.ti 8 +.IR OPTIONS " := {" +\fB-h\fR | \fB--help\fR | +{\fB-f\fR | \fB--family\fR } +{\fBinet\fR | \fBinet6\fR } | +\fB-4\fR | \fB-6\fR } + +.SH "DESCRIPTION" +.LP +The routel script will list routes in a format that some might consider +easier to interpret than the +.B ip +route list equivalent. + +.SH "AUTHORS" +.LP +Rewritten by Stephen Hemminger <stephen@networkplumber.org>. +.br +Original script by Stephen R. van den Berg <srb@cuci.nl>. +.br +This manual page was written by Andreas Henriksson <andreas@fatal.se>, for the Debian GNU/Linux system. +.SH "SEE ALSO" +.LP +ip(8) diff --git a/man/man8/rtacct.8 b/man/man8/rtacct.8 new file mode 100644 index 0000000..988a6d1 --- /dev/null +++ b/man/man8/rtacct.8 @@ -0,0 +1,61 @@ +.TH RTACCT 8 "27 June, 2007" + +.SH NAME +nstat, rtacct - network statistics tools. + +.SH SYNOPSIS +Usage: nstat [ -h?vVzrnasd:t:jp ] [ PATTERN [ PATTERN ] ] +.br +Usage: rtacct [ -h?vVzrnasd:t: ] [ ListOfRealms ] + +.SH DESCRIPTION +.B nstat +and +.B rtacct +are simple tools to monitor kernel snmp counters and network interface statistics. + +.B nstat +can filter kernel snmp counters by name with one or several specified wildcards. 
Wildcards are case-insensitive and can include special symbols +.B ? +and +.B * +. + +.SH OPTIONS +.B \-h, \-\-help +Print help +.TP +.B \-V, \-\-version +Print version +.TP +.B \-z, \-\-zeros +Dump zero counters too. By default they are not shown. +.TP +.B \-r, \-\-reset +Reset history. +.TP +.B \-n, \-\-nooutput +Do not display anything, only update history. +.TP +.B \-a, \-\-ignore +Dump absolute values of counters. The default is to calculate increments since the previous use. +.TP +.B \-s, \-\-noupdate +Do not update history, so that the next time you will see counters including values accumulated to the moment of this measurement too. +.TP +.B \-j, \-\-json +Display results in JSON format. +.TP +.B \-p, \-\-pretty +When combined with +.BR \-\-json , +pretty print the output. +.TP +.B \-d, \-\-scan <INTERVAL> +Run in daemon mode collecting statistics. <INTERVAL> is interval between measurements in seconds. +.TP +.B \-t, \-\-interval <INTERVAL> +Time interval to average rates. Default value is 60 seconds. + +.SH SEE ALSO +lnstat(8) diff --git a/man/man8/rtmon.8 b/man/man8/rtmon.8 new file mode 100644 index 0000000..38a2b77 --- /dev/null +++ b/man/man8/rtmon.8 @@ -0,0 +1,68 @@ +.TH RTMON 8 +.SH NAME +rtmon \- listens to and monitors RTnetlink +.SH SYNOPSIS +.B rtmon +.RI "[ options ] file FILE [ all | LISTofOBJECTS ]" +.SH DESCRIPTION +This manual page documents briefly the +.B rtmon +command. +.PP +.B rtmon +listens on +.I netlink +socket and monitors routing table changes. + +.I rtmon +can be started before the first network configuration command is issued. +For example if you insert: + +.B rtmon file /var/log/rtmon.log + +in a startup script, you will be able to view the full history later. +Certainly, it is possible to start rtmon at any time. It prepends the history with the state snapshot dumped at the moment of starting. + +.SH OPTIONS +.I rtmon supports the following options: +.TP +.B \-Version +Print version and exit. 
+.TP +.B help +Show summary of options. +.TP +.B file FILE [ all | LISTofOBJECTS ] +Log output to FILE. LISTofOBJECTS is the list of object types that we +want to monitor. It may contain 'link', 'address', 'route' +and 'all'. 'link' specifies the network device, 'address' the protocol +(IP or IPv6) address on a device, 'route' the routing table entry +and 'all' does what the name says. +.TP +.B \-family [ inet | inet6 | link | help ] +Specify protocol family. 'inet' is IPv4, 'inet6' is IPv6, 'link' +means that no networking protocol is involved and 'help' prints usage information. +.TP +.B \-4 +Use IPv4. Shortcut for -family inet. +.TP +.B \-6 +Use IPv6. Shortcut for -family inet6. +.TP +.B \-0 +Use a special family identifier meaning that no networking protocol is involved. Shortcut for -family link. +.SH USAGE EXAMPLES +.TP +.B # rtmon file /var/log/rtmon.log +Log to file /var/log/rtmon.log, then run: +.TP +.B # ip monitor file /var/log/rtmon.log +to display logged output from file. +.SH SEE ALSO +.BR ip (8) +.SH AUTHOR +.B rtmon +was written by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>. +.PP +This manual page was written by Michael Prokop <mika@grml.org>, +for the Debian project (but may be used by others). diff --git a/man/man8/rtstat.8 b/man/man8/rtstat.8 new file mode 100644 index 0000000..080e2b2 --- /dev/null +++ b/man/man8/rtstat.8 @@ -0,0 +1 @@ +.so man8/lnstat.8 diff --git a/man/man8/ss.8 b/man/man8/ss.8 new file mode 100644 index 0000000..4ece41f --- /dev/null +++ b/man/man8/ss.8 @@ -0,0 +1,607 @@ +.TH SS 8 +.SH NAME +ss \- another utility to investigate sockets +.SH SYNOPSIS +.B ss +.RI [ options ] " [ FILTER ]" +.SH DESCRIPTION +.B ss +is used to dump socket statistics. It allows showing information similar +to +.IR netstat . +It can display more TCP and state information than other tools. + +.SH OPTIONS +When no option is used ss displays a list of open non-listening +sockets (e.g. TCP/UNIX/UDP) that have established connection. 
+.TP +.B \-h, \-\-help +Show summary of options. +.TP +.B \-V, \-\-version +Output version information. +.TP +.B \-H, \-\-no-header +Suppress header line. +.TP +.B \-O, \-\-oneline +Print each socket's data on a single line. +.TP +.B \-n, \-\-numeric +Do not try to resolve service names. Show exact bandwidth values, instead of human-readable. +.TP +.B \-r, \-\-resolve +Try to resolve numeric address/ports. +.TP +.B \-a, \-\-all +Display both listening and non-listening (for TCP this means +established connections) sockets. +.TP +.B \-l, \-\-listening +Display only listening sockets (these are omitted by default). +.TP +.B \-B, \-\-bound-inactive +Display only TCP bound but inactive (not listening, connecting, etc.) sockets +(these are omitted by default). +.TP +.B \-o, \-\-options +Show timer information. For TCP protocol, the output format is: +.RS +.P +timer:(<timer_name>,<expire_time>,<retrans>) +.P +.TP +.B <timer_name> +the name of the timer, there are five kinds of timer names: +.RS +.P +.B on +: means one of these timers: TCP retrans timer, TCP early retrans +timer and tail loss probe timer +.P +.BR keepalive ": tcp keep alive timer" +.P +.BR timewait ": timewait stage timer" +.P +.BR persist ": zero window probe timer" +.P +.BR unknown ": none of the above timers" +.RE +.TP +.B <expire_time> +how long until the timer expires +.P +.TP +.B <retrans> +how many times the retransmission occurred +.RE +.TP +.B \-e, \-\-extended +Show detailed socket information. The output format is: +.RS +.P +uid:<uid_number> ino:<inode_number> sk:<cookie> +.P +.TP +.B <uid_number> +the user id the socket belongs to +.P +.TP +.B <inode_number> +the socket's inode number in VFS +.P +.TP +.B <cookie> +a UUID of the socket +.RE +.TP +.B \-m, \-\-memory +Show socket memory usage. 
The output format is: +.RS +.P +skmem:(r<rmem_alloc>,rb<rcv_buf>,t<wmem_alloc>,tb<snd_buf>, +.br +.RS +.RS +f<fwd_alloc>,w<wmem_queued>,o<opt_mem>, +.RE +.RE +.br +.RS +.RS +bl<back_log>,d<sock_drop>) +.RE +.RE +.P +.TP +.B <rmem_alloc> +the memory allocated for receiving packet +.P +.TP +.B <rcv_buf> +the total memory that can be allocated for receiving packet +.P +.TP +.B <wmem_alloc> +the memory used for sending packet (which has been sent to layer 3) +.P +.TP +.B <snd_buf> +the total memory that can be allocated for sending packet +.P +.TP +.B <fwd_alloc> +the memory allocated by the socket as cache, but not used for +receiving/sending packet yet. If memory is needed to send/receive a packet, +the memory in this cache will be used before allocating additional +memory. +.P +.TP +.B <wmem_queued> +The memory allocated for sending packet (which has not been sent to layer 3) +.P +.TP +.B <opt_mem> +The memory used for storing socket option, e.g., the key for TCP MD5 signature +.P +.TP +.B <back_log> +The memory used for the sk backlog queue. On a process context, if the +process is receiving packet, and a new packet is received, it will be +put into the sk backlog queue, so it can be received by the process +immediately +.P +.TP +.B <sock_drop> +the number of packets dropped before they are de-multiplexed into the socket +.RE +.TP +.B \-p, \-\-processes +Show process using socket. +.TP +.B \-T, \-\-threads +Show thread using socket. Implies +.BR \-p . +.TP +.B \-i, \-\-info +Show internal TCP information.
Below fields may appear: +.RS +.P +.TP +.B ts +show string "ts" if the timestamp option is set +.P +.TP +.B sack +show string "sack" if the sack option is set +.P +.TP +.B ecn +show string "ecn" if the explicit congestion notification option is set +.P +.TP +.B ecnseen +show string "ecnseen" if the saw ecn flag is found in received packets +.P +.TP +.B fastopen +show string "fastopen" if the fastopen option is set +.P +.TP +.B cong_alg +the congestion algorithm name, the default congestion algorithm is "cubic" +.P +.TP +.B wscale:<snd_wscale>:<rcv_wscale> +if window scale option is used, this field shows the send scale factor +and receive scale factor +.P +.TP +.B rto:<icsk_rto> +tcp re-transmission timeout value, the unit is millisecond +.P +.TP +.B backoff:<icsk_backoff> +used for exponential backoff re-transmission, the actual +re-transmission timeout value is icsk_rto << icsk_backoff +.P +.TP +.B rtt:<rtt>/<rttvar> +rtt is the average round trip time, rttvar is the mean deviation of +rtt, their units are millisecond +.P +.TP +.B ato:<ato> +ack timeout, unit is millisecond, used for delay ack mode +.P +.TP +.B mss:<mss> +max segment size +.P +.TP +.B cwnd:<cwnd> +congestion window size +.P +.TP +.B pmtu:<pmtu> +path MTU value +.P +.TP +.B ssthresh:<ssthresh> +tcp congestion window slow start threshold +.P +.TP +.B bytes_acked:<bytes_acked> +bytes acked +.P +.TP +.B bytes_received:<bytes_received> +bytes received +.P +.TP +.B segs_out:<segs_out> +segments sent out +.P +.TP +.B segs_in:<segs_in> +segments received +.P +.TP +.B send <send_bps>bps +egress bps +.P +.TP +.B lastsnd:<lastsnd> +how long time since the last packet sent, the unit is millisecond +.P +.TP +.B lastrcv:<lastrcv> +how long time since the last packet received, the unit is millisecond +.P +.TP +.B lastack:<lastack> +how long time since the last ack received, the unit is millisecond +.P +.TP +.B pacing_rate <pacing_rate>bps/<max_pacing_rate>bps +the pacing rate and max pacing rate +.P +.TP +.B 
rcv_space:<rcv_space> +a helper variable for TCP internal auto tuning socket receive buffer +.P +.TP +.B tcp-ulp-mptcp flags:[MmBbJjecv] token:<rem_token(rem_id)/loc_token(loc_id)> seq:<sn> sfseq:<ssn> ssnoff:<off> maplen:<maplen> +MPTCP subflow information +.P +.RE +.TP +.B \-\-tos +Show ToS and priority information. Below fields may appear: +.RS +.P +.TP +.B tos +IPv4 Type-of-Service byte +.P +.TP +.B tclass +IPv6 Traffic Class byte +.P +.TP +.B class_id +Class id set by net_cls cgroup. If class is zero this shows priority +set by SO_PRIORITY. +.RE +.TP +.B \-\-cgroup +Show cgroup information. Below fields may appear: +.RS +.P +.TP +.B cgroup +Cgroup v2 pathname. This pathname is relative to the mount point of the hierarchy. +.RE +.TP +.B \-\-tipcinfo +Show internal tipc socket information. +.TP +.B \-K, \-\-kill +Attempts to forcibly close sockets. This option displays sockets that are +successfully closed and silently skips sockets that the kernel does not support +closing. It supports IPv4 and IPv6 sockets only. +.TP +.B \-s, \-\-summary +Print summary statistics. This option does not parse socket lists obtaining +summary from various sources. It is useful when amount of sockets is so huge +that parsing /proc/net/tcp is painful. +.TP +.B \-E, \-\-events +Continually display sockets as they are destroyed +.TP +.B \-Z, \-\-context +As the +.B \-p +option but also shows process security context. If the +.B \-T +option is used, also shows thread security context. +.sp +For +.BR netlink (7) +sockets the initiating process context is displayed as follows: +.RS +.RS +.IP "1." 4 +If valid pid show the process context. +.IP "2." 4 +If destination is kernel (pid = 0) show kernel initial context. +.IP "3." 4 +If a unique identifier has been allocated by the kernel or netlink user, +show context as "unavailable". This will generally indicate that a +process has more than one netlink socket active. 
+.RE +.RE +.TP +.B \-z, \-\-contexts +As the +.B \-Z +option but also shows the socket context. The socket context is +taken from the associated inode and is not the actual socket +context held by the kernel. Sockets are typically labeled with the +context of the creating process, however the context shown will reflect +any policy role, type and/or range transition rules applied, +and is therefore a useful reference. +.TP +.B \-N NSNAME, \-\-net=NSNAME +Switch to the specified network namespace name. +.TP +.B \-b, \-\-bpf +Show socket classic BPF filters (only administrators are allowed to get these +information). +.TP +.B \-4, \-\-ipv4 +Display only IP version 4 sockets (alias for -f inet). +.TP +.B \-6, \-\-ipv6 +Display only IP version 6 sockets (alias for -f inet6). +.TP +.B \-0, \-\-packet +Display PACKET sockets (alias for -f link). +.TP +.B \-t, \-\-tcp +Display TCP sockets. +.TP +.B \-u, \-\-udp +Display UDP sockets. +.TP +.B \-d, \-\-dccp +Display DCCP sockets. +.TP +.B \-w, \-\-raw +Display RAW sockets. +.TP +.B \-x, \-\-unix +Display Unix domain sockets (alias for -f unix). +.TP +.B \-S, \-\-sctp +Display SCTP sockets. +.TP +.B \-\-tipc +Display tipc sockets (alias for -f tipc). +.TP +.TP +.B \-\-vsock +Display vsock sockets (alias for -f vsock). +.TP +.B \-\-xdp +Display XDP sockets (alias for -f xdp). +.TP +.B \-M, \-\-mptcp +Display MPTCP sockets. +.TP +.B \-\-inet-sockopt +Display inet socket options. +.TP +.B \-f FAMILY, \-\-family=FAMILY +Display sockets of type FAMILY. Currently the following families are +supported: unix, inet, inet6, link, netlink, vsock, tipc, xdp. +.TP +.B \-A QUERY, \-\-query=QUERY, \-\-socket=QUERY +List of socket tables to dump, separated by commas. The following identifiers +are understood: all, inet, tcp, udp, raw, unix, packet, netlink, unix_dgram, +unix_stream, unix_seqpacket, packet_raw, packet_dgram, dccp, sctp, tipc, +vsock_stream, vsock_dgram, xdp, mptcp. 
Any item in the list may optionally be +prefixed by an exclamation mark +.RB ( ! ) +to exclude that socket table from being dumped. +.TP +.B \-D FILE, \-\-diag=FILE +Do not display anything, just dump raw information about TCP sockets +to FILE after applying filters. If FILE is - stdout is used. +.TP +.B \-F FILE, \-\-filter=FILE +Read filter information from FILE. Each line of FILE is interpreted +like single command line option. If FILE is - stdin is used. +.TP +.B FILTER := [ state STATE-FILTER ] [ EXPRESSION ] +Please take a look at the official documentation for details regarding filters. + +.SH STATE-FILTER + +.B STATE-FILTER +allows one to construct arbitrary set of states to match. Its syntax is +sequence of keywords state and exclude followed by identifier of +state. +.TP +Available identifiers are: + +All standard TCP states: +.BR established ", " syn-sent ", " syn-recv ", " fin-wait-1 ", " fin-wait-2 ", " time-wait ", " closed ", " close-wait ", " last-ack ", " +.BR listening " and " closing. + +.B all +- for all the states + +.B connected +- all the states except for +.BR listening " and " closed + +.B synchronized +- all the +.B connected +states except for +.B syn-sent + +.B bucket +- states, which are maintained as minisockets, i.e. +.BR time-wait " and " syn-recv + +.B big +- opposite to +.B bucket + +.B bound-inactive +- bound but otherwise inactive sockets (not listening, connecting, etc.) + +.SH EXPRESSION + +.B EXPRESSION +allows filtering based on specific criteria. +.B EXPRESSION +consists of a series of predicates combined by boolean operators. The possible operators in increasing +order of precedence are +.B or +(or | or ||), +.B and +(or & or &&), and +.B not +(or !). If no operator is between consecutive predicates, an implicit +.B and +operator is assumed. Subexpressions can be grouped with "(" and ")". +.P +The following predicates are supported: + +.TP +.B {dst|src} [=] HOST +Test if the destination or source matches HOST. 
See HOST SYNTAX for details. +.TP +.B {dport|sport} [OP] [FAMILY:]:PORT +Compare the destination or source port to PORT. OP can be any of "<", "<=", "=", "!=", +">=" and ">". Following normal arithmetic rules. FAMILY and PORT are as described in +HOST SYNTAX below. +.TP +.B dev [=|!=] DEVICE +Match based on the device the connection uses. DEVICE can either be a device name or the +index of the interface. +.TP +.B fwmark [=|!=] MASK +Matches based on the fwmark value for the connection. This can either be a specific mark value +or a mark value followed by a "/" and a bitmask of which bits to use in the comparison. For example +"fwmark = 0x01/0x03" would match if the two least significant bits of the fwmark were 0x01. +.TP +.B cgroup [=|!=] PATH +Match if the connection is part of a cgroup at the given path. +.TP +.B autobound +Match if the port or path of the source address was automatically allocated +(rather than explicitly specified). +.P +Most operators have aliases. If no operator is supplied "=" is assumed. +Each of the following groups of operators are all equivalent: +.RS +.IP \(bu 2 += == eq +.IP \(bu +!= ne neq +.IP \(bu +> gt +.IP \(bu +< lt +.IP \(bu +>= ge geq +.IP \(bu +<= le leq +.IP \(bu +! not +.IP \(bu +| || or +.IP \(bu +& && and +.RE +.SH HOST SYNTAX +.P +The general host syntax is [FAMILY:]ADDRESS[:PORT]. +.P +FAMILY must be one of the families supported by the -f option. If not given +it defaults to the family given with the -f option, and if that is also +missing, will assume either inet or inet6. Note that all host conditions in the +expression should either all be the same family or be only inet and inet6. If there +is some other mixture of families, the results will probably be unexpected. +.P +The form of ADDRESS and PORT depends on the family used. "*" can be used as +a wildcard for either the address or port. 
The details for each family are as +follows: +.TP +.B unix +ADDRESS is a glob pattern (see +.BR fnmatch (3)) +that will be matched case-insensitively against the unix socket's address. Both path and abstract +names are supported. Unix addresses do not support a port, and "*" cannot be used as a wildcard. +.TP +.B link +ADDRESS is the case-insensitive name of an Ethernet protocol to match. PORT +is either a device name or a device index for the desired link device, as seen +in the output of ip link. +.TP +.B netlink +ADDRESS is a descriptor of the netlink family. Possible values come from +/etc/iproute2/nl_protos. PORT is the port id of the socket, which is usually +the same as the owning process id. The value "kernel" can be used to represent +the kernel (port id of 0). +.TP +.B vsock +ADDRESS is an integer representing the CID address, and PORT is the port. +.TP +.BR inet \ and\ inet6 +ADDRESS is an ip address (either v4 or v6 depending on the family) or a DNS +hostname that resolves to an ip address of the required version. An ipv6 +address must be enclosed in "[" and "]" to disambiguate the port separator. The +address may additionally have a prefix length given in CIDR notation (a slash +followed by the prefix length in bits). PORT is either the numerical +socket port, or the service name for the port to match. + +.SH USAGE EXAMPLES +.TP +.B ss -t -a +Display all TCP sockets. +.TP +.B ss -t -a -Z +Display all TCP sockets with process SELinux security contexts. +.TP +.B ss -u -a +Display all UDP sockets. +.TP +.B ss -o state established '( dport = :ssh or sport = :ssh )' +Display all established ssh connections. +.TP +.B ss -x src /tmp/.X11-unix/* +Find all local processes connected to X server. +.TP +.B ss -o state fin-wait-1 '( sport = :http or sport = :https )' dst 193.233.7/24 +List all the tcp sockets in state FIN-WAIT-1 for our apache to network +193.233.7/24 and look at their timers. 
+.TP +.B ss -a -A 'all,!tcp' +List sockets in all states from all socket tables but TCP. +.SH SEE ALSO +.BR ip (8), +.br +.BR RFC " 793 " +- https://tools.ietf.org/rfc/rfc793.txt (TCP states) + +.SH AUTHOR +.I ss +was written by Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>. +.PP +This manual page was written by Michael Prokop <mika@grml.org> +for the Debian project (but may be used by others). diff --git a/man/man8/tc-actions.8 b/man/man8/tc-actions.8 new file mode 100644 index 0000000..5c399cd --- /dev/null +++ b/man/man8/tc-actions.8 @@ -0,0 +1,312 @@ +.TH "actions in tc" 8 "1 Aug 2017" "iproute2" "Linux" + +.SH NAME +actions \- independently defined actions in tc +.SH SYNOPSIS +.B tc +[ +.I TC_OPTIONS +] +.B actions +.BR add " | " change " | " replace +.I ACTSPEC + +.B tc +[ +.I TC_OPTIONS +] +.B actions +.BR get " | " delete +.I ACTISPEC + +.B tc +[ +.I TC_OPTIONS +] +.B actions flush +.I ACTNAMESPEC + +.B tc +[ +.I TC_OPTIONS +] +.B actions +.BR ls " | " list +.I ACTNAMESPEC +[ +.I ACTFILTER +] + +.in +8 +.I ACTSPEC +:= +.B action +.I ACTDETAIL +[ +.I INDEXSPEC +] [ +.I COOKIESPEC +] [ +.I FLAGS +] [ +.I HWSTATSSPEC +] [ +.I CONTROL +] [ +.I SKIPSPEC +] + +.I ACTISPEC +:= +.I ACTNAMESPEC INDEXSPEC + +.I ACTNAMESPEC +:= +.B action +ACTNAME + +.I INDEXSPEC +:= +.BI index " INDEX" + +.I ACTFILTER +:= +.BI since " MSTIME" + +.I COOKIESPEC +:= +.BI cookie " COOKIE" + +.I FLAGS +:= +.I no_percpu + +.I HWSTATSSPEC +:= +.BR hw_stats " {" +.IR immediate " | " delayed " | " disabled " }" + +.I ACTDETAIL +:= +.I ACTNAME ACTPARAMS + +.I ACTNAME +may be any valid action type: gact, mirred, bpf, connmark, csum, police, etc. + +.I MSTIME +Time since last update. + +.I CONTROL +:= { +.IR reclassify " | " pipe " | " drop " | " continue " | " ok +} + +.I SKIPSPEC +:= { +.IR skip_sw " | " skip_hw +} + +.I TC_OPTIONS +These are the options that are specific to +.B tc +and not only the options. Refer to +.BR tc(8) +for more information. 
+.in + +.SH DESCRIPTION + +The +.B actions +object in +.B tc +allows a user to define actions independently of a classifier (filter). These +actions can then be assigned to one or more filters, with any +packets matching the classifier's criteria having that action performed +on them. + +Each action type (mirred, police, etc.) will have its own table to store +all created actions. + +.SH OPERATIONS +.TP +.B add +Create a new action in that action's table. + +.TP +.B change +.TQ +.B replace +Make modifications to an existing action. +.TP +.B get +Display the action with the specified index value. When combined with the +.B -s +option for +.BR tc "," +display the statistics for that action. +.TP +.B delete +Delete the action with the specified index value. If the action is already +associated with a classifier, it does not delete the classifier. +.TP +.B ls +.TQ +.B list +List all the actions in the specified table. When combined with the +.B -s +option for +.BR tc "," +display the statistics for all actions in the specified table. +When combined with the option +.B since +allows doing a millisecond time-filter since the last time an +action was used in the datapath. +.TP +.B flush +Delete all actions stored in the specified table. + +.SH ACTION OPTIONS +Note that these options are available to all action types. +.TP +.BI index " INDEX" +Specify the table index value of an action. +.I INDEX +is a 32-bit value that is unique to the specific type of action referenced. + +.RS +For +.BR add ", " change ", and" +.B replace +operations, the index is +.BR optional. +When adding a new action, +specifying an index value will assign the action to that index unless that +index value has already been assigned. Omitting the index value for an add +operation will cause the kernel to assign a value to the new action. +.RE + +.RS +For +.BR get " and " delete +operations, the index is +.B required +to identify the specific action to be displayed or deleted. 
+.RE + +.TP +.BI cookie " COOKIE" +In addition to the specific action, mark the matching packet with the value +specified by +.IR COOKIE "." +The +.I COOKIE +is a 128-bit value that will not be interpreted by the kernel whatsoever. +As such, it can be used as a correlating value for maintaining user state. +The value to be stored is completely arbitrary and does not require a specific +format. It is stored inside the action structure itself. + +.TP +.I FLAGS +Action-specific flags. Currently, the only supported flag is +.I no_percpu +which indicates that action is expected to have minimal software data-path +traffic and doesn't need to allocate stat counters with percpu allocator. +This option is intended to be used by hardware-offloaded actions. + +.TP +.BI hw_stats " HW_STATS" +Specifies the type of HW stats of new action. If omitted, any stats counter type +is going to be used, according to driver and its resources. +The +.I HW_STATS +indicates the type. Any of the following are valid: +.RS +.TP +.B immediate +Means that in dump, user gets the current HW stats state from the device +queried at the dump time. +.TP +.B delayed +Means that in dump, user gets HW stats that might be out of date for +some time, maybe a couple of seconds. This is the case when driver polls +stats updates periodically or when it gets async stats update +from the device. +.TP +.B disabled +No HW stats are going to be available in dump. +.RE + +.TP +.BI since " MSTIME" +When dumping a large number of actions, a millisecond time-filter can be +specified with +.IR MSTIME "." +The +.I MSTIME +is a millisecond count since the last time a packet hit the action. +As an example, specifying "since 20000" dumps all actions +that have seen packets in the last 20 seconds. This option is useful +when the kernel has a large number of actions and you are only interested +in recently used actions. + +.TP +.I CONTROL +The +.I CONTROL +indicates how +.B tc +should proceed after executing the action.
Any of the following are valid: +.RS +.TP +.B reclassify +Restart the classification by jumping back to the first filter attached to +the action's parent. +.TP +.B pipe +Continue with the next action. This is the default control. +.TP +.B drop +Drop the packet without running any further actions. +.TP +.B continue +Continue the classification with the next filter. +.TP +.B pass +Return to the calling qdisc for packet processing, and end classification of +this packet. +.RE + +.TP +.I SKIPSPEC +The +.I SKIPSPEC +indicates how +.B tc +should proceed when executing the action. Any of the following are valid: +.RS +.TP +.B skip_sw +Do not process action by software. If hardware has no offload support for this +action, operation will fail. +.TP +.B skip_hw +Do not process action by hardware. +.RE + +.SH SEE ALSO +.BR tc (8), +.BR tc-bpf (8), +.BR tc-connmark (8), +.BR tc-csum (8), +.BR tc-ife (8), +.BR tc-mirred (8), +.BR tc-nat (8), +.BR tc-pedit (8), +.BR tc-police (8), +.BR tc-simple (8), +.BR tc-skbedit (8), +.BR tc-skbmod (8), +.BR tc-tunnel_key (8), +.BR tc-vlan (8), +.BR tc-xt (8) diff --git a/man/man8/tc-basic.8 b/man/man8/tc-basic.8 new file mode 100644 index 0000000..d86d46a --- /dev/null +++ b/man/man8/tc-basic.8 @@ -0,0 +1,34 @@ +.TH "Basic classifier in tc" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +basic \- basic traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " basic " [ " match +.IR EMATCH_TREE " ] [ " +.B action +.IR ACTION_SPEC " ] [ " +.B classid +.IR CLASSID " ]" +.SH DESCRIPTION +The +.B basic +filter allows one to classify packets using the extended match infrastructure. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Push matching packets into the class identified by +.IR CLASSID . +.TP +.BI match " EMATCH_TREE" +Match packets using the extended match infrastructure.
See +.BR tc-ematch (8) +for a detailed description of the allowed syntax in +.IR EMATCH_TREE . +.SH SEE ALSO +.BR tc (8), +.BR tc-ematch (8) diff --git a/man/man8/tc-bfifo.8 b/man/man8/tc-bfifo.8 new file mode 100644 index 0000000..bc05ef4 --- /dev/null +++ b/man/man8/tc-bfifo.8 @@ -0,0 +1,72 @@ +.TH PBFIFO 8 "10 January 2002" "iproute2" "Linux" +.SH NAME +pfifo \- Packet limited First In, First Out queue +.P +bfifo \- Byte limited First In, First Out queue + +.SH SYNOPSIS +.B tc qdisc ... add pfifo +.B [ limit +packets +.B ] +.P +.B tc qdisc ... add bfifo +.B [ limit +bytes +.B ] + +.SH DESCRIPTION +The pfifo and bfifo qdiscs are unadorned First In, First Out queues. They are the +simplest queues possible and therefore have no overhead. +.B pfifo +constrains the queue size as measured in packets. +.B bfifo +does so as measured in bytes. + +Like all non-default qdiscs, they maintain statistics. This might be a reason to prefer +pfifo or bfifo over the default. + +.SH ALGORITHM +A list of packets is maintained, when a packet is enqueued it gets inserted at the tail of +a list. When a packet needs to be sent out to the network, it is taken from the head of the list. + +If the list is too long, no further packets are allowed on. This is called 'tail drop'. + +.SH PARAMETERS +.TP +limit +Maximum queue size. Specified in bytes for bfifo, in packets for pfifo. For pfifo, defaults +to the interface txqueuelen, as specified with +.BR ip (8). +The range for this parameter is [0, UINT32_MAX]. + +For bfifo, it defaults to the txqueuelen multiplied by the interface MTU. +The range for this parameter is [0, UINT32_MAX] bytes. + +Note: The link layer header was considered when counting packets length. + +.SH OUTPUT +The output of +.B tc -s qdisc ls +contains the limit, either in packets or in bytes, and the number of bytes +and packets actually sent. An unsent and dropped packet only appears between braces +and is not counted as 'Sent'. 
+ +In this example, the queue length is 100 packets, 45894 bytes were sent over 681 packets. +No packets were dropped, and as the pfifo queue does not slow down packets, there were also no +overlimits: +.P +.nf +# tc -s qdisc ls dev eth0 +qdisc pfifo 8001: dev eth0 limit 100p + Sent 45894 bytes 681 pkts (dropped 0, overlimits 0) +.fi + +If a backlog occurs, this is displayed as well. +.SH SEE ALSO +.BR tc (8) + +.SH AUTHORS +Alexey N. Kuznetsov, <kuznet@ms2.inr.ac.ru> + +This manpage maintained by bert hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-bpf.8 b/man/man8/tc-bpf.8 new file mode 100644 index 0000000..01230ce --- /dev/null +++ b/man/man8/tc-bpf.8 @@ -0,0 +1,986 @@ +.TH "BPF classifier and actions in tc" 8 "18 May 2015" "iproute2" "Linux" +.SH NAME +BPF \- BPF programmable classifier and actions for ingress/egress +queueing disciplines +.SH SYNOPSIS +.SS eBPF classifier (filter) or action: +.B tc filter ... bpf +[ +.B object-file +OBJ_FILE ] [ +.B section +CLS_NAME ] [ +.B export +UDS_FILE ] [ +.B verbose +] [ +.B direct-action +| +.B da +] [ +.B skip_hw +| +.B skip_sw +] [ +.B police +POLICE_SPEC ] [ +.B action +ACTION_SPEC ] [ +.B classid +CLASSID ] +.br +.B tc action ... bpf +[ +.B object-file +OBJ_FILE ] [ +.B section +CLS_NAME ] [ +.B export +UDS_FILE ] [ +.B verbose +] + +.SS cBPF classifier (filter) or action: +.B tc filter ... bpf +[ +.B bytecode-file +BPF_FILE | +.B bytecode +BPF_BYTECODE ] [ +.B police +POLICE_SPEC ] [ +.B action +ACTION_SPEC ] [ +.B classid +CLASSID ] +.br +.B tc action ... bpf +[ +.B bytecode-file +BPF_FILE | +.B bytecode +BPF_BYTECODE ] + +.SH DESCRIPTION + +Extended Berkeley Packet Filter ( +.B eBPF +) and classic Berkeley Packet Filter +(originally known as BPF, for better distinction referred to as +.B cBPF +here) are both available as a fully programmable and highly efficient +classifier and actions. 
They both offer a minimal instruction set for +implementing small programs which can safely be loaded into the kernel +and thus executed in a tiny virtual machine from kernel space. An in-kernel +verifier guarantees that a specified program always terminates and neither +crashes nor leaks data from the kernel. + +In Linux, it's generally considered that eBPF is the successor of cBPF. +The kernel internally transforms cBPF expressions into eBPF expressions and +executes the latter. Execution of them can be performed in an interpreter +or at setup time, they can be just-in-time compiled (JIT'ed) to run as +native machine code. +.PP +Currently, the eBPF JIT compiler is available for the following architectures: +.IP * 4 +x86_64 (since Linux 3.18) +.PD 0 +.IP * +arm64 (since Linux 3.18) +.IP * +s390 (since Linux 4.1) +.IP * +ppc64 (since Linux 4.8) +.IP * +sparc64 (since Linux 4.12) +.IP * +mips64 (since Linux 4.13) +.IP * +arm32 (since Linux 4.14) +.IP * +x86_32 (since Linux 4.18) +.PD +.PP +Whereas the following architectures have cBPF, but did not (yet) switch to eBPF +JIT support: +.IP * 4 +ppc32 +.PD 0 +.IP * +sparc32 +.IP * +mips32 +.PD +.PP +eBPF's instruction set has similar underlying principles as the cBPF +instruction set, it however is modelled closer to the underlying +architecture to better mimic native instruction sets with the aim to +achieve a better run-time performance. It is designed to be JIT'ed with +a one to one mapping, which can also open up the possibility for compilers +to generate optimized eBPF code through an eBPF backend that performs +almost as fast as natively compiled code. Given that LLVM provides such +an eBPF backend, eBPF programs can therefore easily be programmed in a +subset of the C language. Other than that, eBPF infrastructure also comes +with a construct called "maps". eBPF maps are key/value stores that are +shared between multiple eBPF programs, but also between eBPF programs and +user space applications. 
+ +For the traffic control subsystem, classifiers and actions that can be +attached to ingress and egress qdiscs can be written in eBPF or cBPF. The +advantage over other classifiers and actions is that eBPF/cBPF provides the +generic framework, while users can implement their highly specialized use +cases efficiently. This means that the classifier or action written that +way will not suffer from feature bloat, and can therefore execute its task +highly efficiently. It allows for non-linear classification and even merging +the action part into the classification. Combined with efficient eBPF map +data structures, user space can push new policies like classids into the +kernel without reloading a classifier, or it can gather statistics that +are pushed into one map and use another one for dynamically load balancing +traffic based on the determined load, just to provide a few examples. + +.SH PARAMETERS +.SS object-file +points to an object file that has an executable and linkable format (ELF) +and contains eBPF opcodes and eBPF map definitions. The LLVM compiler +infrastructure with +.B clang(1) +as a C language front end is one project that supports emitting eBPF object +files that can be passed to the eBPF classifier (more details in the +.B EXAMPLES +section). This option is mandatory when an eBPF classifier or action is +to be loaded. + +.SS section +is the name of the ELF section from the object file, where the eBPF +classifier or action resides. By default the section name for the +classifier is called "classifier", and for the action "action". Given +that a single object file can contain multiple classifiers and actions, +the corresponding section name needs to be specified, if it differs +from the defaults. + +.SS export +points to a Unix domain socket file.
In case the eBPF object file also +contains a section named "maps" with eBPF map specifications, then the +map file descriptors can be handed off via the Unix domain socket to +an eBPF "agent" herding all descriptors after tc lifetime. This can be +some third party application implementing the IPC counterpart for the +import, that uses them for calling into +.B bpf(2) +system call to read out or update eBPF map data from user space, for +example, for monitoring purposes or to push down new policies. + +.SS verbose +if set, it will dump the eBPF verifier output, even if loading the eBPF +program was successful. By default, only on error, the verifier log is +being emitted to the user. + +.SS direct-action | da +instructs eBPF classifier to not invoke external TC actions, instead use the +TC actions return codes (\fBTC_ACT_OK\fR, \fBTC_ACT_SHOT\fR etc.) for +classifiers. + +.SS skip_hw | skip_sw +hardware offload control flags. By default TC will try to offload +filters to hardware if possible. +.B skip_hw +explicitly disables the attempt to offload. +.B skip_sw +forces the offload and disables running the eBPF program in the kernel. +If hardware offload is not possible and this flag was set kernel will +report an error and filter will not be installed at all. + +.SS police +is an optional parameter for an eBPF/cBPF classifier that specifies a +police in +.B tc(1) +which is attached to the classifier, for example, on an ingress qdisc. + +.SS action +is an optional parameter for an eBPF/cBPF classifier that specifies a +subsequent action in +.B tc(1) +which is attached to a classifier. + +.SS classid +.SS flowid +provides the default traffic control class identifier for this eBPF/cBPF +classifier. The default class identifier can also be overwritten by the +return code of the eBPF/cBPF program. A default return code of +.B -1 +specifies the here provided default class identifier to be used. 
A return +code of the eBPF/cBPF program of 0 implies that no match took place, and +a return code other than these two will override the default classid. This +allows for efficient, non-linear classification with only a single eBPF/cBPF +program as opposed to having multiple individual programs for various class +identifiers which would need to reparse packet contents. + +.SS bytecode +is being used for loading cBPF classifier and actions only. The cBPF bytecode +is directly passed as a text string in the form of +.B \(aqs,c t f k,c t f k,c t f k,...' +, where +.B s +denotes the number of subsequent 4-tuples. One such 4-tuple consists of +.B c t f k +decimals, where +.B c +represents the cBPF opcode, +.B t +the jump true offset target, +.B f +the jump false offset target and +.B k +the immediate constant/literal. There are various tools that generate code +in this loadable format, for example, +.B bpf_asm +that ships with the Linux kernel source tree under +.B tools/net/ +, so it is certainly not expected to hack this by hand. The +.B bytecode +or +.B bytecode-file +option is mandatory when a cBPF classifier or action is to be loaded. + +.SS bytecode-file +also being used to load a cBPF classifier or action. It's effectively the +same as +.B bytecode +only that the cBPF bytecode is not passed directly via command line, but +rather resides in a text file. + +.SH EXAMPLES +.SS eBPF TOOLING +A full blown example including eBPF agent code can be found inside the +iproute2 source package under: +.B examples/bpf/ + +As prerequisites, the kernel needs to have the eBPF system call namely +.B bpf(2) +enabled and ships with +.B cls_bpf +and +.B act_bpf +kernel modules for the traffic control subsystem. 
To enable eBPF/eBPF JIT +support, depending which of the two the given architecture supports: + +.in +4n +.B echo 1 > /proc/sys/net/core/bpf_jit_enable +.in + +A given restricted C file can be compiled via LLVM as: + +.in +4n +.B clang -O2 -emit-llvm -c bpf.c -o - | llc -march=bpf -filetype=obj -o bpf.o +.in + +The compiler invocation might still simplify in future, so for now, +it's quite handy to alias this construct in one way or another, for +example: +.in +4n +.nf +.sp +__bcc() { + clang -O2 -emit-llvm -c $1 -o - | \\ + llc -march=bpf -filetype=obj -o "`basename $1 .c`.o" +} + +alias bcc=__bcc +.fi +.in + +A minimal, stand-alone unit, which matches on all traffic with the +default classid (return code of -1) looks like: + +.in +4n +.nf +.sp +#include <linux/bpf.h> + +#ifndef __section +# define __section(x) __attribute__((section(x), used)) +#endif + +__section("classifier") int cls_main(struct __sk_buff *skb) +{ + return -1; +} + +char __license[] __section("license") = "GPL"; +.fi +.in + +More examples can be found further below in subsection +.B eBPF PROGRAMMING +as focus here will be on tooling. + +There can be various other sections, for example, also for actions. +Thus, an object file in eBPF can contain multiple entrance points. +Always a specific entrance point, however, must be specified when +configuring with tc. A license must be part of the restricted C code +and the license string syntax is the same as with Linux kernel modules. +The kernel reserves its right that some eBPF helper functions can be +restricted to GPL compatible licenses only, and thus may reject a program +from loading into the kernel when such a license mismatch occurs. + +The resulting object file from the compilation can be inspected with +the usual set of tools that also operate on normal object files, for +example +.B objdump(1) +for inspecting ELF section headers: + +.in +4n +.nf +.sp +objdump -h bpf.o +[...] 
+3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3 + CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE +4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3 + CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE +5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3 + CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE +6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2 + CONTENTS, ALLOC, LOAD, DATA +7 license 00000004 0000000000000000 0000000000000000 00000988 2**0 + CONTENTS, ALLOC, LOAD, DATA +[...] +.fi +.in + +Adding an eBPF classifier from an object file that contains a classifier +in the default ELF section is trivial (note that instead of "object-file" +also shortcuts such as "obj" can be used): + +.in +4n +.B bcc bpf.c +.br +.B tc filter add dev em1 parent 1: bpf obj bpf.o flowid 1:1 +.in + +In case the classifier resides in ELF section "mycls", then that same +command needs to be invoked as: + +.in +4n +.B tc filter add dev em1 parent 1: bpf obj bpf.o sec mycls flowid 1:1 +.in + +Dumping the classifier configuration will tell the location of the +classifier, in other words that it's from object file "bpf.o" under +section "mycls": + +.in +4n +.B tc filter show dev em1 +.br +.B filter parent 1: protocol all pref 49152 bpf +.br +.B filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[mycls] +.in + +The same program can also be installed on ingress qdisc side as opposed +to egress ... + +.in +4n +.B tc qdisc add dev em1 handle ffff: ingress +.br +.B tc filter add dev em1 parent ffff: bpf obj bpf.o sec mycls flowid ffff:1 +.in + +\&... and again dumped from there: + +.in +4n +.B tc filter show dev em1 parent ffff: +.br +.B filter protocol all pref 49152 bpf +.br +.B filter protocol all pref 49152 bpf handle 0x1 flowid ffff:1 bpf.o:[mycls] +.in + +Attaching a classifier and action on ingress has the restriction that +it doesn't have an actual underlying queueing discipline. 
What ingress +can do is to classify, mangle, redirect or drop packets. When queueing +is required on ingress side, then ingress must redirect packets to the +.B ifb +device, otherwise policing can be used. Moreover, ingress can be used to +have an early drop point of unwanted packets before they hit upper layers +of the networking stack, perform network accounting with eBPF maps that +could be shared with egress, or have an early mangle and/or redirection +point to different networking devices. + +Multiple eBPF actions and classifier can be placed into a single +object file within various sections. In that case, non-default section +names must be provided, which is the case for both actions in this +example: + +.in +4n +.B tc filter add dev em1 parent 1: bpf obj bpf.o flowid 1:1 \e +.br +.in +25n +.B action bpf obj bpf.o sec action-mark \e +.br +.B action bpf obj bpf.o sec action-rand ok +.in -25n +.in -4n + +The advantage of this is that the classifier and the two actions can +then share eBPF maps with each other, if implemented in the programs. + +In order to access eBPF maps from user space beyond +.B tc(8) +setup lifetime, the ownership can be transferred to an eBPF agent via +Unix domain sockets. There are two possibilities for implementing this: + +.B 1) +implementation of an own eBPF agent that takes care of setting up +the Unix domain socket and implementing the protocol that +.B tc(8) +dictates. A code example of this can be found inside the iproute2 +source package under: +.B examples/bpf/ + +.B 2) +use +.B tc exec +for transferring the eBPF map file descriptors through a Unix domain +socket, and spawning an application such as +.B sh(1) +\&. This approach's advantage is that tc will place the file descriptors +into the environment and thus make them available just like stdin, stdout, +stderr file descriptors, meaning, in case user applications run from within +this fd-owner shell, they can terminate and restart without losing eBPF +maps file descriptors. 
Example invocation with the previous classifier and +action mixture: + +.in +4n +.B tc exec bpf imp /tmp/bpf +.br +.B tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf flowid 1:1 \e +.br +.in +25n +.B action bpf obj bpf.o sec action-mark \e +.br +.B action bpf obj bpf.o sec action-rand ok +.in -25n +.in -4n + +Assuming that eBPF maps are shared with classifier and actions, it's +enough to export them once, for example, from within the classifier +or action command. tc will setup all eBPF map file descriptors at the +time when the object file is first parsed. + +When a shell has been spawned, the environment will have a couple of +eBPF related variables. BPF_NUM_MAPS provides the total number of maps +that have been transferred over the Unix domain socket. BPF_MAP<X>'s +value is the file descriptor number that can be accessed in eBPF agent +applications, in other words, it can directly be used as the file +descriptor value for the +.B bpf(2) +system call to retrieve or alter eBPF map values. <X> denotes the +identifier of the eBPF map. It corresponds to the +.B id +member of +.B struct bpf_elf_map +\& from the tc eBPF map specification. + +The environment in this example looks as follows: + +.in +4n +.nf +.sp +sh# env | grep BPF + BPF_NUM_MAPS=3 + BPF_MAP1=6 + BPF_MAP0=5 + BPF_MAP2=7 +sh# ls -la /proc/self/fd + [...] + lrwx------. 1 root root 64 Apr 14 16:46 5 -> anon_inode:bpf-map + lrwx------. 1 root root 64 Apr 14 16:46 6 -> anon_inode:bpf-map + lrwx------. 1 root root 64 Apr 14 16:46 7 -> anon_inode:bpf-map +sh# my_bpf_agent +.fi +.in + +eBPF agents are very useful in that they can prepopulate eBPF maps from +user space, monitor statistics via maps and based on that feedback, for +example, rewrite classids in eBPF map values during runtime. 
Given that eBPF +agents are implemented as normal applications, they can also dynamically +receive traffic control policies from external controllers and thus push +them down into eBPF maps to dynamically adapt to network conditions. Moreover, +eBPF maps can also be shared with other eBPF program types (e.g. tracing), +thus very powerful combinations can be implemented. + +.SS eBPF PROGRAMMING + +eBPF classifier and actions are being implemented in restricted C syntax +(in future, there could additionally be new language frontends supported). + +The header file +.B linux/bpf.h +provides eBPF helper functions that can be called from an eBPF program. +This man page will only provide two minimal, stand-alone examples, have a +look at +.B examples/bpf +from the iproute2 source package for a fully fledged flow dissector +example to better demonstrate some of the possibilities with eBPF. + +Supported 32 bit classifier return codes from the C program and their meanings: +.in +4n +.B 0 +, denotes a mismatch +.br +.B -1 +, denotes the default classid configured from the command line +.br +.B else +, everything else will override the default classid to provide a facility for +non-linear matching +.in + +Supported 32 bit action return codes from the C program and their meanings ( +.B linux/pkt_cls.h +): +.in +4n +.B TC_ACT_OK (0) +, will terminate the packet processing pipeline and allows the packet to +proceed +.br +.B TC_ACT_SHOT (2) +, will terminate the packet processing pipeline and drops the packet +.br +.B TC_ACT_UNSPEC (-1) +, will use the default action configured from tc (similarly as returning +.B -1 +from a classifier) +.br +.B TC_ACT_PIPE (3) +, will iterate to the next action, if available +.br +.B TC_ACT_RECLASSIFY (1) +, will terminate the packet processing pipeline and start classification +from the beginning +.br +.B else +, everything else is an unspecified return code +.in + +Both classifier and action return codes are supported in eBPF and cBPF
+programs. + +To demonstrate restricted C syntax, a minimal toy classifier example is +provided, which assumes that egress packets, for instance originating +from a container, have previously been marked in interval [0, 255]. The +program keeps statistics on different marks for user space and maps the +classid to the root qdisc with the marking itself as the minor handle: + +.in +4n +.nf +.sp +#include <stdint.h> +#include <asm/types.h> + +#include <linux/bpf.h> +#include <linux/pkt_sched.h> + +#include "helpers.h" + +struct tuple { + long packets; + long bytes; +}; + +#define BPF_MAP_ID_STATS 1 /* agent's map identifier */ +#define BPF_MAX_MARK 256 + +struct bpf_elf_map __section("maps") map_stats = { + .type = BPF_MAP_TYPE_ARRAY, + .id = BPF_MAP_ID_STATS, + .size_key = sizeof(uint32_t), + .size_value = sizeof(struct tuple), + .max_elem = BPF_MAX_MARK, + .pinning = PIN_GLOBAL_NS, +}; + +static inline void cls_update_stats(const struct __sk_buff *skb, + uint32_t mark) +{ + struct tuple *tu; + + tu = bpf_map_lookup_elem(&map_stats, &mark); + if (likely(tu)) { + __sync_fetch_and_add(&tu->packets, 1); + __sync_fetch_and_add(&tu->bytes, skb->len); + } +} + +__section("cls") int cls_main(struct __sk_buff *skb) +{ + uint32_t mark = skb->mark; + + if (unlikely(mark >= BPF_MAX_MARK)) + return 0; + + cls_update_stats(skb, mark); + + return TC_H_MAKE(TC_H_ROOT, mark); +} + +char __license[] __section("license") = "GPL"; +.fi +.in + +Another small example is a port redirector which demuxes destination port +80 into the interval [8080, 8087] steered by RSS, that can then be attached +to ingress qdisc. 
The exercise of adding the egress counterpart and IPv6 +support is left to the reader: + +.in +4n +.nf +.sp +#include <asm/types.h> +#include <asm/byteorder.h> + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/tcp.h> + +#include "helpers.h" + +static inline void set_tcp_dport(struct __sk_buff *skb, int nh_off, + __u16 old_port, __u16 new_port) +{ + bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check), + old_port, new_port, sizeof(new_port)); + bpf_skb_store_bytes(skb, nh_off + offsetof(struct tcphdr, dest), + &new_port, sizeof(new_port), 0); +} + +static inline int lb_do_ipv4(struct __sk_buff *skb, int nh_off) +{ + __u16 dport, dport_new = 8080, off; + __u8 ip_proto, ip_vl; + + ip_proto = load_byte(skb, nh_off + + offsetof(struct iphdr, protocol)); + if (ip_proto != IPPROTO_TCP) + return 0; + + ip_vl = load_byte(skb, nh_off); + if (likely(ip_vl == 0x45)) + nh_off += sizeof(struct iphdr); + else + nh_off += (ip_vl & 0xF) << 2; + + dport = load_half(skb, nh_off + offsetof(struct tcphdr, dest)); + if (dport != 80) + return 0; + + off = skb->queue_mapping & 7; + set_tcp_dport(skb, nh_off - BPF_LL_OFF, __constant_htons(80), + __cpu_to_be16(dport_new + off)); + return -1; +} + +__section("lb") int lb_main(struct __sk_buff *skb) +{ + int ret = 0, nh_off = BPF_LL_OFF + ETH_HLEN; + + if (likely(skb->protocol == __constant_htons(ETH_P_IP))) + ret = lb_do_ipv4(skb, nh_off); + + return ret; +} + +char __license[] __section("license") = "GPL"; +.fi +.in + +The related helper header file +.B helpers.h +in both examples was: + +.in +4n +.nf +.sp +/* Misc helper macros. 
*/ +#define __section(x) __attribute__((section(x), used)) +#define offsetof(x, y) __builtin_offsetof(x, y) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + +/* ELF map definition */ +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; + __u32 inner_id; + __u32 inner_idx; +}; + +/* Some used BPF function calls. */ +static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, + int len, int flags) = + (void *) BPF_FUNC_skb_store_bytes; +static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, + int to, int flags) = + (void *) BPF_FUNC_l4_csum_replace; +static void *(*bpf_map_lookup_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_lookup_elem; + +/* Some used BPF intrinsics. */ +unsigned long long load_byte(void *skb, unsigned long long off) + asm ("llvm.bpf.load.byte"); +unsigned long long load_half(void *skb, unsigned long long off) + asm ("llvm.bpf.load.half"); +.fi +.in + +Best practice, we recommend to only have a single eBPF classifier loaded +in tc and perform +.B all +necessary matching and mangling from there instead of a list of individual +classifier and separate actions. Just a single classifier tailored for a +given use-case will be most efficient to run. + +.SS eBPF DEBUGGING + +Both tc +.B filter +and +.B action +commands for +.B bpf +support an optional +.B verbose +parameter that can be used to inspect the eBPF verifier log. It is dumped +by default in case of an error. 
+ +In case the eBPF/cBPF JIT compiler has been enabled, it can also be +instructed to emit a debug output of the resulting opcode image into +the kernel log, which can be read via +.B dmesg(1) +: + +.in +4n +.B echo 2 > /proc/sys/net/core/bpf_jit_enable +.in + +The Linux kernel source tree ships additionally under +.B tools/net/ +a small helper called +.B bpf_jit_disasm +that reads out the opcode image dump from the kernel log and dumps the +resulting disassembly: + +.in +4n +.B bpf_jit_disasm -o +.in + +Other than that, the Linux kernel also contains an extensive eBPF/cBPF +test suite module called +.B test_bpf +\&. Upon ... + +.in +4n +.B modprobe test_bpf +.in + +\&... it performs a diversity of test cases and dumps the results into +the kernel log that can be inspected with +.B dmesg(1) +\&. The results can differ depending on whether the JIT compiler is enabled +or not. In case of failed test cases, the module will fail to load. In +such cases, we urge you to file a bug report to the related JIT authors, +Linux kernel and networking mailing lists. + +.SS cBPF + +Although we generally recommend switching to implementing +.B eBPF +classifier and actions, for the sake of completeness, a few words on how to +program in cBPF follow here. + +Likewise, the +.B bpf_jit_enable +switch can be enabled as mentioned already. Tooling such as +.B bpf_jit_disasm +is also independent of whether eBPF or cBPF code is being loaded. + +Unlike in eBPF, classifier and action are not implemented in restricted C, +but rather in a minimal assembler-like language or with the help of other +tooling. + +The raw interface with tc takes opcodes directly. For example, the most +minimal classifier matching on every packet resulting in the default +classid of 1:1 looks like: + +.in +4n +.B tc filter add dev em1 parent 1: bpf bytecode '1,6 0 0 4294967295,' flowid 1:1 +.in + +The first decimal of the bytecode sequence denotes the number of subsequent +4-tuples of cBPF opcodes.
As mentioned, such a 4-tuple consists of +.B c t f k +decimals, where +.B c +represents the cBPF opcode, +.B t +the jump true offset target, +.B f +the jump false offset target and +.B k +the immediate constant/literal. Here, this denotes an unconditional return +from the program with immediate value of -1. + +Thus, for egress classification, Willem de Bruijn implemented a minimal stand-alone +helper tool under the GNU General Public License version 2 for +.B iptables(8) +BPF extension, which abuses the +.B libpcap +internal classic BPF compiler, his code derived here for usage with +.B tc(8) +: + +.in +4n +.nf +.sp +#include <pcap.h> +#include <stdio.h> + +int main(int argc, char **argv) +{ + struct bpf_program prog; + struct bpf_insn *ins; + int i, ret, dlt = DLT_RAW; + + if (argc < 2 || argc > 3) + return 1; + if (argc == 3) { + dlt = pcap_datalink_name_to_val(argv[1]); + if (dlt == -1) + return 1; + } + + ret = pcap_compile_nopcap(-1, dlt, &prog, argv[argc - 1], + 1, PCAP_NETMASK_UNKNOWN); + if (ret) + return 1; + + printf("%d,", prog.bf_len); + ins = prog.bf_insns; + + for (i = 0; i < prog.bf_len - 1; ++ins, ++i) + printf("%u %u %u %u,", ins->code, + ins->jt, ins->jf, ins->k); + printf("%u %u %u %u", + ins->code, ins->jt, ins->jf, ins->k); + + pcap_freecode(&prog); + return 0; +} +.fi +.in + +Given this small helper, any +.B tcpdump(8) +filter expression can be abused as a classifier where a match will +result in the default classid: + +.in +4n +.B bpftool EN10MB 'tcp[tcpflags] & tcp-syn != 0' > /var/bpf/tcp-syn +.br +.B tc filter add dev em1 parent 1: bpf bytecode-file /var/bpf/tcp-syn flowid 1:1 +.in + +Basically, such a minimal generator is equivalent to: + +.in +4n +.B tcpdump -iem1 -ddd 'tcp[tcpflags] & tcp-syn != 0' | tr '\\\\n' ',' > /var/bpf/tcp-syn +.in + +Since +.B libpcap +does not support all Linux' specific cBPF extensions in its compiler, the +Linux kernel also ships under +.B tools/net/ +a minimal BPF assembler called +.B bpf_asm +for providing 
full control. For detailed syntax and semantics on implementing +such programs by hand, see references under +.B FURTHER READING +\&. + +Trivial toy example in +.B bpf_asm +for classifying IPv4/TCP packets, saved in a text file called +.B foobar +: + +.in +4n +.nf +.sp +ldh [12] +jne #0x800, drop +ldb [23] +jneq #6, drop +ret #-1 +drop: ret #0 +.fi +.in + +Similarly, such a classifier can be loaded as: + +.in +4n +.B bpf_asm foobar > /var/bpf/tcp-syn +.br +.B tc filter add dev em1 parent 1: bpf bytecode-file /var/bpf/tcp-syn flowid 1:1 +.in + +For BPF classifiers, the Linux kernel provides additionally under +.B tools/net/ +a small BPF debugger called +.B bpf_dbg +, which can be used to test a classifier against pcap files, single-step +or add various breakpoints into the classifier program and dump register +contents during runtime. + +Implementing an action in classic BPF is rather limited in the sense that +packet mangling is not supported. Therefore, it's generally recommended to +make the switch to eBPF, whenever possible. + +.SH FURTHER READING +Further and more technical details about the BPF architecture can be found +in the Linux kernel source tree under +.B Documentation/networking/filter.txt +\&. + +Further details on eBPF +.B tc(8) +examples can be found in the iproute2 source +tree under +.B examples/bpf/ +\&. + +.SH SEE ALSO +.BR tc (8), +.BR tc-ematch (8) +.BR bpf (2) +.BR bpf (4) + +.SH AUTHORS +Manpage written by Daniel Borkmann. + +Please report corrections or improvements to the Linux kernel networking +mailing list: +.B <netdev@vger.kernel.org> diff --git a/man/man8/tc-cake.8 b/man/man8/tc-cake.8 new file mode 100644 index 0000000..ced9ac7 --- /dev/null +++ b/man/man8/tc-cake.8 @@ -0,0 +1,726 @@ +.TH CAKE 8 "19 July 2018" "iproute2" "Linux" +.SH NAME +CAKE \- Common Applications Kept Enhanced (CAKE) +.SH SYNOPSIS +.B tc qdisc ... 
cake +.br +[ +.BR bandwidth +RATE | +.BR unlimited* +| +.BR autorate-ingress +] +.br +[ +.BR rtt +TIME | +.BR datacentre +| +.BR lan +| +.BR metro +| +.BR regional +| +.BR internet* +| +.BR oceanic +| +.BR satellite +| +.BR interplanetary +] +.br +[ +.BR besteffort +| +.BR diffserv8 +| +.BR diffserv4 +| +.BR diffserv3* +] +.br +[ +.BR flowblind +| +.BR srchost +| +.BR dsthost +| +.BR hosts +| +.BR flows +| +.BR dual-srchost +| +.BR dual-dsthost +| +.BR triple-isolate* +] +.br +[ +.BR nat +| +.BR nonat* +] +.br +[ +.BR wash +| +.BR nowash* +] +.br +[ +.BR split-gso* +| +.BR no-split-gso +] +.br +[ +.BR ack-filter +| +.BR ack-filter-aggressive +| +.BR no-ack-filter* +] +.br +[ +.BR memlimit +LIMIT ] +.br +[ +.BR fwmark +MASK ] +.br +[ +.BR ptm +| +.BR atm +| +.BR noatm* +] +.br +[ +.BR overhead +N | +.BR conservative +| +.BR raw* +] +.br +[ +.BR mpu +N ] +.br +[ +.BR ingress +| +.BR egress* +] +.br +(* marks defaults) + + +.SH DESCRIPTION +CAKE (Common Applications Kept Enhanced) is a shaping-capable queue discipline +which uses both AQM and FQ. It combines COBALT, which is an AQM algorithm +combining Codel and BLUE, a shaper which operates in deficit mode, and a variant +of DRR++ for flow isolation. 8-way set-associative hashing is used to virtually +eliminate hash collisions. Priority queuing is available through a simplified +diffserv implementation. Overhead compensation for various encapsulation +schemes is tightly integrated. + +All settings are optional; the default settings are chosen to be sensible in +most common deployments. Most people will only need to set the +.B bandwidth +parameter to get useful results, but reading the +.B Overhead Compensation +and +.B Round Trip Time +sections is strongly encouraged. + +.SH SHAPER PARAMETERS +CAKE uses a deficit-mode shaper, which does not exhibit the initial burst +typical of token-bucket shapers. It will automatically burst precisely as much +as required to maintain the configured throughput. 
As such, it is very +straightforward to configure. +.PP +.B unlimited +(default) +.br + No limit on the bandwidth. +.PP +.B bandwidth +RATE +.br + Set the shaper bandwidth. See +.BR tc(8) +or examples below for details of the RATE value. +.PP +.B autorate-ingress +.br + Automatic capacity estimation based on traffic arriving at this qdisc. +This is most likely to be useful with cellular links, which tend to change +quality randomly. A +.B bandwidth +parameter can be used in conjunction to specify an initial estimate. The shaper +will periodically be set to a bandwidth slightly below the estimated rate. This +estimator cannot estimate the bandwidth of links downstream of itself. + +.SH OVERHEAD COMPENSATION PARAMETERS +The size of each packet on the wire may differ from that seen by Linux. The +following parameters allow CAKE to compensate for this difference by internally +considering each packet to be bigger than Linux informs it. To assist users who +are not expert network engineers, keywords have been provided to represent a +number of common link technologies. + +.SS Manual Overhead Specification +.B overhead +BYTES +.br + Adds BYTES to the size of each packet. BYTES may be negative; values +between -64 and 256 (inclusive) are accepted. +.PP +.B mpu +BYTES +.br + Rounds each packet (including overhead) up to a minimum length +BYTES. BYTES may not be negative; values between 0 and 256 (inclusive) +are accepted. +.PP +.B atm +.br + Compensates for ATM cell framing, which is normally found on ADSL links. +This is performed after the +.B overhead +parameter above. ATM uses fixed 53-byte cells, each of which can carry 48 bytes +payload. +.PP +.B ptm +.br + Compensates for PTM encoding, which is normally found on VDSL2 links and +uses a 64b/65b encoding scheme. It is even more efficient to simply +derate the specified shaper bandwidth by a factor of 64/65 or 0.984. See +ITU G.992.3 Annex N and IEEE 802.3 Section 61.3 for details. 
+.PP +.B noatm +.br + Disables ATM and PTM compensation. + +.SS Failsafe Overhead Keywords +These two keywords are provided for quick-and-dirty setup. Use them if you +can't be bothered to read the rest of this section. +.PP +.B raw +(default) +.br + Turns off all overhead compensation in CAKE. The packet size reported +by Linux will be used directly. +.PP + Other overhead keywords may be added after "raw". The effect of this is +to make the overhead compensation operate relative to the reported packet size, +not the underlying IP packet size. +.PP +.B conservative +.br + Compensates for more overhead than is likely to occur on any +widely-deployed link technology. +.br + Equivalent to +.B overhead 48 atm. + +.SS ADSL Overhead Keywords +Most ADSL modems have a way to check which framing scheme is in use. Often this +is also specified in the settings document provided by the ISP. The keywords in +this section are intended to correspond with these sources of information. All +of them implicitly set the +.B atm +flag. +.PP +.B pppoa-vcmux +.br + Equivalent to +.B overhead 10 atm +.PP +.B pppoa-llc +.br + Equivalent to +.B overhead 14 atm +.PP +.B pppoe-vcmux +.br + Equivalent to +.B overhead 32 atm +.PP +.B pppoe-llcsnap +.br + Equivalent to +.B overhead 40 atm +.PP +.B bridged-vcmux +.br + Equivalent to +.B overhead 24 atm +.PP +.B bridged-llcsnap +.br + Equivalent to +.B overhead 32 atm +.PP +.B ipoa-vcmux +.br + Equivalent to +.B overhead 8 atm +.PP +.B ipoa-llcsnap +.br + Equivalent to +.B overhead 16 atm +.PP +See also the Ethernet Correction Factors section below. + +.SS VDSL2 Overhead Keywords +ATM was dropped from VDSL2 in favour of PTM, which is a much more +straightforward framing scheme. Some ISPs retained PPPoE for compatibility with +their existing back-end systems. 
+.PP +.B pppoe-ptm +.br + Equivalent to +.B overhead 30 ptm + +.br + PPPoE: 2B PPP + 6B PPPoE + +.br + ETHERNET: 6B dest MAC + 6B src MAC + 2B ethertype + 4B Frame Check Sequence + +.br + PTM: 1B Start of Frame (S) + 1B End of Frame (Ck) + 2B TC-CRC (PTM-FCS) +.br +.PP +.B bridged-ptm +.br + Equivalent to +.B overhead 22 ptm +.br + ETHERNET: 6B dest MAC + 6B src MAC + 2B ethertype + 4B Frame Check Sequence + +.br + PTM: 1B Start of Frame (S) + 1B End of Frame (Ck) + 2B TC-CRC (PTM-FCS) +.br +.PP +See also the Ethernet Correction Factors section below. + +.SS DOCSIS Cable Overhead Keyword +DOCSIS is the universal standard for providing Internet service over cable-TV +infrastructure. + +In this case, the actual on-wire overhead is less important than the packet size +the head-end equipment uses for shaping and metering. This is specified to be +an Ethernet frame including the CRC (aka FCS). +.PP +.B docsis +.br + Equivalent to +.B overhead 18 mpu 64 noatm + +.SS Ethernet Overhead Keywords +.PP +.B ethernet +.br + Accounts for Ethernet's preamble, inter-frame gap, and Frame Check +Sequence. Use this keyword when the bottleneck being shaped for is an +actual Ethernet cable. +.br + Equivalent to +.B overhead 38 mpu 84 noatm +.PP +.B ether-vlan +.br + Adds 4 bytes to the overhead compensation, accounting for an IEEE 802.1Q +VLAN header appended to the Ethernet frame header. NB: Some ISPs use one or +even two of these within PPPoE; this keyword may be repeated as necessary to +express this. + +.SH ROUND TRIP TIME PARAMETERS +Active Queue Management (AQM) consists of embedding congestion signals in the +packet flow, which receivers use to instruct senders to slow down when the queue +is persistently occupied. CAKE uses ECN signalling when available, and packet +drops otherwise, according to a combination of the Codel and BLUE AQM algorithms +called COBALT. + +Very short latencies require a very rapid AQM response to adequately control +latency. 
However, such a rapid response tends to impair throughput when the +actual RTT is relatively long. CAKE allows specifying the RTT it assumes for +tuning various parameters. Actual RTTs within an order of magnitude of this +will generally work well for both throughput and latency management. + +At the 'lan' setting and below, the time constants are similar in magnitude to +the jitter in the Linux kernel itself, so congestion might be signalled +prematurely. The flows will then become sparse and total throughput reduced, +leaving little or no back-pressure for the fairness logic to work against. Use +the "metro" setting for local lans unless you have a custom kernel. +.PP +.B rtt +TIME +.br + Manually specify an RTT. +.PP +.B datacentre +.br + For extremely high-performance 10GigE+ networks only. Equivalent to +.B rtt 100us. +.PP +.B lan +.br + For pure Ethernet (not Wi-Fi) networks, at home or in the office. Don't +use this when shaping for an Internet access link. Equivalent to +.B rtt 1ms. +.PP +.B metro +.br + For traffic mostly within a single city. Equivalent to +.B rtt 10ms. +.PP +.B regional +.br + For traffic mostly within a European-sized country. Equivalent to +.B rtt 30ms. +.PP +.B internet +(default) +.br + This is suitable for most Internet traffic. Equivalent to +.B rtt 100ms. +.PP +.B oceanic +.br + For Internet traffic with generally above-average latency, such as that +suffered by Australasian residents. Equivalent to +.B rtt 300ms. +.PP +.B satellite +.br + For traffic via geostationary satellites. Equivalent to +.B rtt 1000ms. +.PP +.B interplanetary +.br + So named because Jupiter is about 1 light-hour from Earth. Use this to +(almost) completely disable AQM actions. Equivalent to +.B rtt 3600s. + +.SH FLOW ISOLATION PARAMETERS +With flow isolation enabled, CAKE places packets from different flows into +different queues, each of which carries its own AQM state. 
Packets from each +queue are then delivered fairly, according to a DRR++ algorithm which minimizes +latency for "sparse" flows. CAKE uses a set-associative hashing algorithm to +minimize flow collisions. + +These keywords specify whether fairness based on source address, destination +address, individual flows, or any combination of those is desired. +.PP +.B flowblind +.br + Disables flow isolation; all traffic passes through a single queue for +each tin. +.PP +.B srchost +.br + Flows are defined only by source address. Could be useful on the egress +path of an ISP backhaul. +.PP +.B dsthost +.br + Flows are defined only by destination address. Could be useful on the +ingress path of an ISP backhaul. +.PP +.B hosts +.br + Flows are defined by source-destination host pairs. This is host +isolation, rather than flow isolation. +.PP +.B flows +.br + Flows are defined by the entire 5-tuple of source address, destination +address, transport protocol, source port and destination port. This is the type +of flow isolation performed by SFQ and fq_codel. +.PP +.B dual-srchost +.br + Flows are defined by the 5-tuple, and fairness is applied first over +source addresses, then over individual flows. Good for use on egress traffic +from a LAN to the internet, where it'll prevent any one LAN host from +monopolising the uplink, regardless of the number of flows they use. +.PP +.B dual-dsthost +.br + Flows are defined by the 5-tuple, and fairness is applied first over +destination addresses, then over individual flows. Good for use on ingress +traffic to a LAN from the internet, where it'll prevent any one LAN host from +monopolising the downlink, regardless of the number of flows they use. +.PP +.B triple-isolate +(default) +.br + Flows are defined by the 5-tuple, and fairness is applied over source +*and* destination addresses intelligently (ie. not merely by host-pairs), and +also over individual flows. 
Use this if you're not certain whether to use +dual-srchost or dual-dsthost; it'll do both jobs at once, preventing any one +host on *either* side of the link from monopolising it with a large number of +flows. +.PP +.B nat +.br + Instructs Cake to perform a NAT lookup before applying flow-isolation +rules, to determine the true addresses and port numbers of the packet, to +improve fairness between hosts "inside" the NAT. This has no practical effect +in "flowblind" or "flows" modes, or if NAT is performed on a different host. +.PP +.B nonat +(default) +.br + Cake will not perform a NAT lookup. Flow isolation will be performed +using the addresses and port numbers directly visible to the interface Cake is +attached to. + +.SH PRIORITY QUEUE PARAMETERS +CAKE can divide traffic into "tins" based on the Diffserv field. Each tin has +its own independent set of flow-isolation queues, and is serviced based on a WRR +algorithm. To avoid perverse Diffserv marking incentives, tin weights have a +"priority sharing" value when bandwidth used by that tin is below a threshold, +and a lower "bandwidth sharing" value when above. Bandwidth is compared against +the threshold using the same algorithm as the deficit-mode shaper. + +Detailed customisation of tin parameters is not provided. The following presets +perform all necessary tuning, relative to the current shaper bandwidth and RTT +settings. +.PP +.B besteffort +.br + Disables priority queuing by placing all traffic in one tin. +.PP +.B precedence +.br + Enables legacy interpretation of TOS "Precedence" field. Use of this +preset on the modern Internet is firmly discouraged. +.PP +.B diffserv4 +.br + Provides a general-purpose Diffserv implementation with four tins: +.br + Bulk (CS1, LE in kernel v5.9+), 6.25% threshold, generally low priority. +.br + Best Effort (general), 100% threshold. +.br + Video (AF4x, AF3x, CS3, AF2x, CS2, TOS4, TOS1), 50% threshold. +.br + Voice (CS7, CS6, EF, VA, CS5, CS4), 25% threshold. 
+.PP +.B diffserv3 +(default) +.br + Provides a simple, general-purpose Diffserv implementation with three tins: +.br + Bulk (CS1, LE in kernel v5.9+), 6.25% threshold, generally low priority. +.br + Best Effort (general), 100% threshold. +.br + Voice (CS7, CS6, EF, VA, TOS4), 25% threshold, reduced Codel interval. + +.PP +.B fwmark +MASK +.br + This option turns on fwmark-based overriding of CAKE's tin selection. +If set, the option specifies a bitmask that will be applied to the fwmark +associated with each packet. If the result of this masking is non-zero, the +result will be right-shifted by the number of least-significant unset bits in +the mask value, and the result will be used as the tin number for that packet. +This can be used to set policies in a firewall script that will override CAKE's +built-in tin selection. + +.SH OTHER PARAMETERS +.B memlimit +LIMIT +.br + Limit the memory consumed by Cake to LIMIT bytes. Note that this does +not translate directly to queue size (so do not size this based on bandwidth +delay product considerations, but rather on worst case acceptable memory +consumption), as there is some overhead in the data structures containing the +packets, especially for small packets. + + By default, the limit is calculated based on the bandwidth and RTT +settings. + +.PP +.B wash + +.br + Traffic entering your diffserv domain is frequently mis-marked in +transit from the perspective of your network, and traffic exiting yours may be +mis-marked from the perspective of the transiting provider. + +Apply the wash option to clear all extra diffserv bits (but not the ECN bits), after +priority queuing has taken place. + +If you are shaping inbound, and cannot trust the diffserv markings (as is the +case for Comcast Cable, among others), it is best to use a single queue +"besteffort" mode with wash.
+ +.PP +.B split-gso + +.br + This option controls whether CAKE will split General Segmentation +Offload (GSO) super-packets into their on-the-wire components and +dequeue them individually. + +.br +Super-packets are created by the networking stack to improve efficiency. +However, because they are larger they take longer to dequeue, which +translates to higher latency for competing flows, especially at lower +bandwidths. CAKE defaults to splitting GSO packets to achieve the lowest +possible latency. At link speeds higher than 10 Gbps, setting the +no-split-gso parameter can increase the maximum achievable throughput by +retaining the full GSO packets. + +.SH OVERRIDING CLASSIFICATION WITH TC FILTERS + +CAKE supports overriding of its internal classification of packets through the +tc filter mechanism. Packets can be assigned to different priority tins by +setting the +.B priority +field on the skb, and the flow hashing can be overridden by setting the +.B classid +parameter. + +.PP +.B Tin override + +.br + To assign a priority tin, the major number of the priority field needs +to match the qdisc handle of the cake instance; if it does, the minor number +will be interpreted as the tin index. For example, to classify all ICMP packets +as 'bulk', the following filter can be used: + +.br + # tc qdisc replace dev eth0 handle 1: root cake diffserv3 + # tc filter add dev eth0 parent 1: protocol ip prio 1 \\ + u32 match icmp type 0 0 action skbedit priority 1:1 + +.PP +.B Flow hash override + +.br + To override flow hashing, the classid can be set. CAKE will interpret +the major number of the classid as the host hash used in host isolation mode, +and the minor number as the flow hash used for flow-based queueing. One or both +of those can be set, and will be used if the relevant flow isolation parameter +is set (i.e., the major number will be ignored if CAKE is not configured in +hosts mode, and the minor number will be ignored if CAKE is not configured in +flows mode). 
+ +.br +This example will assign all ICMP packets to the first queue: + +.br + # tc qdisc replace dev eth0 handle 1: root cake + # tc filter add dev eth0 parent 1: protocol ip prio 1 \\ + u32 match icmp type 0 0 classid 0:1 + +.br +If only one of the host and flow overrides is set, CAKE will compute the other +hash from the packet as normal. Note, however, that the host isolation mode +works by assigning a host ID to the flow queue; so if overriding both host and +flow, the same flow cannot have more than one host assigned. In addition, it is +not possible to assign different source and destination host IDs through the +override mechanism; if a host ID is assigned, it will be used as both source and +destination host. + + + +.SH EXAMPLES +# tc qdisc delete root dev eth0 +.br +# tc qdisc add root dev eth0 cake bandwidth 100Mbit ethernet +.br +# tc -s qdisc show dev eth0 +.br +qdisc cake 1: root refcnt 2 bandwidth 100Mbit diffserv3 triple-isolate rtt 100.0ms noatm overhead 38 mpu 84 + Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 + memory used: 0b of 5000000b + capacity estimate: 100Mbit + min/max network layer size: 65535 / 0 + min/max overhead-adjusted size: 65535 / 0 + average network hdr offset: 0 + + Bulk Best Effort Voice + thresh 6250Kbit 100Mbit 25Mbit + target 5.0ms 5.0ms 5.0ms + interval 100.0ms 100.0ms 100.0ms + pk_delay 0us 0us 0us + av_delay 0us 0us 0us + sp_delay 0us 0us 0us + pkts 0 0 0 + bytes 0 0 0 + way_inds 0 0 0 + way_miss 0 0 0 + way_cols 0 0 0 + drops 0 0 0 + marks 0 0 0 + ack_drop 0 0 0 + sp_flows 0 0 0 + bk_flows 0 0 0 + un_flows 0 0 0 + max_len 0 0 0 + quantum 300 1514 762 + +After some use: +.br +# tc -s qdisc show dev eth0 + +qdisc cake 1: root refcnt 2 bandwidth 100Mbit diffserv3 triple-isolate rtt 100.0ms noatm overhead 38 mpu 84 + Sent 44709231 bytes 31931 pkt (dropped 45, overlimits 93782 requeues 0) + backlog 33308b 22p requeues 0 + memory used: 292352b of 5000000b + capacity estimate: 100Mbit + 
min/max network layer size: 28 / 1500 + min/max overhead-adjusted size: 84 / 1538 + average network hdr offset: 14 + + Bulk Best Effort Voice + thresh 6250Kbit 100Mbit 25Mbit + target 5.0ms 5.0ms 5.0ms + interval 100.0ms 100.0ms 100.0ms + pk_delay 8.7ms 6.9ms 5.0ms + av_delay 4.9ms 5.3ms 3.8ms + sp_delay 727us 1.4ms 511us + pkts 2590 21271 8137 + bytes 3081804 30302659 11426206 + way_inds 0 46 0 + way_miss 3 17 4 + way_cols 0 0 0 + drops 20 15 10 + marks 0 0 0 + ack_drop 0 0 0 + sp_flows 2 4 1 + bk_flows 1 2 1 + un_flows 0 0 0 + max_len 1514 1514 1514 + quantum 300 1514 762 + +.SH SEE ALSO +.BR tc (8), +.BR tc-codel (8), +.BR tc-fq_codel (8), +.BR tc-htb (8) + +.SH AUTHORS +Cake's principal author is Jonathan Morton, with contributions from +Tony Ambardar, Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, +Sebastian Moeller, Ryan Mounce, Dean Scarff, Nils Andreas Svee, and Dave Täht. + +This manual page was written by Loganaden Velvindron. Please report corrections +to the Linux Networking mailing list <netdev@vger.kernel.org>. diff --git a/man/man8/tc-cbs.8 b/man/man8/tc-cbs.8 new file mode 100644 index 0000000..ad1d882 --- /dev/null +++ b/man/man8/tc-cbs.8 @@ -0,0 +1,124 @@ +.TH CBS 8 "18 Sept 2017" "iproute2" "Linux" +.SH NAME +CBS \- Credit Based Shaper (CBS) Qdisc +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B parent +classid +.B [ handle +major: +.B ] cbs idleslope +idleslope +.B sendslope +sendslope +.B hicredit +hicredit +.B locredit +locredit +.B [ offload +0|1 +.B ] + +.SH DESCRIPTION +The CBS (Credit Based Shaper) qdisc implements the shaping algorithm +defined by the IEEE 802.1Q-2014 Section 8.6.8.2, which applies a well +defined rate limiting method to the traffic. + +This queueing discipline is intended to be used by TSN (Time Sensitive +Networking) applications, the CBS parameters are derived directly by +what is described by the Annex L of the IEEE 802.1Q-2014 +Specification. The algorithm and how it affects the latency are +detailed there. 
+ +CBS is meant to be installed under another qdisc that maps packet +flows to traffic classes; one example is +.BR mqprio (8). + +.SH PARAMETERS +.TP +idleslope +Idleslope is the rate of credits that is accumulated (in kilobits per +second) when there is at least one packet waiting for transmission. +Packets are transmitted when the current value of credits is equal to or +greater than zero. When there is no packet to be transmitted the +amount of credits is set to zero. This is the main tunable of the CBS +algorithm and represents the bandwidth that will be consumed. +Note that when calculating idleslope, the entire packet size must be +considered, including headers from all layers (i.e. MAC framing and any +overhead from the physical layer), as described by IEEE 802.1Q-2014 +section 34.4. + +As an example, for an ethernet frame carrying 284 bytes of payload, +and with no VLAN tags, you must add 14 bytes for the Ethernet headers, +4 bytes for the Frame check sequence (CRC), and 20 bytes for the L1 +overhead: 12 bytes of interpacket gap, 7 bytes of preamble and 1 byte +of start of frame delimiter. That results in 322 bytes for the total +packet size, which is then used for calculating the idleslope. + +.TP +sendslope +Sendslope is the rate of credits that is depleted (it should be a +negative number of kilobits per second) when a transmission is +occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section +8.6.8.2 item g): + +sendslope = idleslope - port_transmit_rate + +.TP +hicredit +Hicredit defines the maximum amount of credits (in bytes) that can be +accumulated.
Hicredit depends on the characteristics of interfering +traffic, 'max_interference_size' is the maximum size of any burst of +traffic that can delay the transmission of a frame that is available +for transmission for this traffic class, (IEEE 802.1Q-2014 Annex L, +Equation L-3): + +hicredit = max_interference_size * (idleslope / port_transmit_rate) + +.TP +locredit +Locredit is the minimum amount of credits that can be reached. It is a +function of the traffic flowing through this qdisc (IEEE 802.1Q-2014 +Annex L, Equation L-2): + +locredit = max_frame_size * (sendslope / port_transmit_rate) + +.TP +offload +When +.B offload +is 1, +.BR cbs (8) +will try to configure the network interface so the CBS algorithm runs +in the controller. The default is 0. + +.SH EXAMPLES + +CBS is used to enforce a Quality of Service by limiting the data rate +of a traffic class. To separate packets into traffic classes the user +may choose +.BR mqprio (8), +and configure it like this: + +.EX +# tc qdisc add dev eth0 handle 100: parent root mqprio num_tc 3 \\ + map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\ + queues 1@0 1@1 2@2 \\ + hw 0 +.EE +.P +To replace the queueing discipline currently attached to traffic class +number 0 with CBS, issue: +.P +.EX +# tc qdisc replace dev eth0 parent 100:4 cbs \\ + locredit -1470 hicredit 30 sendslope -980000 idleslope 20000 +.EE + +These values are obtained from the following parameters: idleslope is +20mbit/s, the transmission rate is 1Gbit/s and the maximum interfering +frame size is 1500 bytes. + +.SH AUTHORS +Vinicius Costa Gomes <vinicius.gomes@intel.com> diff --git a/man/man8/tc-cgroup.8 b/man/man8/tc-cgroup.8 new file mode 100644 index 0000000..2bea7d4 --- /dev/null +++ b/man/man8/tc-cgroup.8 @@ -0,0 +1,80 @@ +.TH "Cgroup classifier in tc" 8 " 21 Oct 2015" "iproute2" "Linux" + +.SH NAME +cgroup \- control group based traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ...
" cgroup " [ " match +.IR EMATCH_TREE " ] [ " +.B action +.IR ACTION_SPEC " ]" +.SH DESCRIPTION +This filter serves as a hint to +.B tc +that the assigned class ID of the net_cls control group the process the packet +originates from belongs to should be used for classification. Obviously, it is +useful for locally generated packets only. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI match " EMATCH_TREE" +Match packets using the extended match infrastructure. See +.BR tc-ematch (8) +for a detailed description of the allowed syntax in +.IR EMATCH_TREE . +.SH EXAMPLES +In order to use this filter, a net_cls control group has to be created first and +class as well as process ID(s) assigned to it. The following creates a net_cls +cgroup named "foobar": + +.RS +.EX +modprobe cls_cgroup +mkdir /sys/fs/cgroup/net_cls +mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls +mkdir /sys/fs/cgroup/net_cls/foobar +.EE +.RE + +To assign a class ID to the created cgroup, a file named +.I net_cls.classid +has to be created which contains the class ID to be assigned as a hexadecimal, +32-bit wide number. The upper 16 bits are reserved for the major handle, the +remaining 16 bits hold the minor. So a class ID of e.g. +.B ff:be +has to be written like so: +.B 0xff00be +(leading zeroes may be omitted). To continue the above example, the following +assigns class ID 1:2 to the foobar cgroup: + +.RS +.EX +echo 0x10002 > /sys/fs/cgroup/net_cls/foobar/net_cls.classid +.EE +.RE + +Finally some PIDs can be assigned to the given cgroup: + +.RS +.EX +echo 1234 > /sys/fs/cgroup/net_cls/foobar/tasks +echo 5678 > /sys/fs/cgroup/net_cls/foobar/tasks +.EE +.RE + +Now simply attaching a +.B cgroup +filter to a +.B qdisc +makes packets from PIDs 1234 and 5678 be pushed into class 1:2.
+ +.SH SEE ALSO +.BR tc (8), +.BR tc-ematch (8), +.br +the file +.I Documentation/cgroups/net_cls.txt +of the Linux kernel tree diff --git a/man/man8/tc-choke.8 b/man/man8/tc-choke.8 new file mode 100644 index 0000000..1916a3d --- /dev/null +++ b/man/man8/tc-choke.8 @@ -0,0 +1,63 @@ +.TH TC 8 "August 2011" "iproute2" "Linux" +.SH NAME +choke \- choose and keep scheduler +.SH SYNOPSIS +.B tc qdisc ... choke +.B limit +packets +.B min +packets +.B max +packets +.B avpkt +bytes +.B burst +packets +.B [ ecn ] [ bandwidth +rate +.B ] probability +chance + +.SH DESCRIPTION + +CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for unresponsive flows) +is a classless qdisc designed to both identify and penalize flows that monopolize the +queue. CHOKe is a variation of RED, and the configuration is similar to RED. + +.SH ALGORITHM +Once the queue hits a certain average length, a random packet is drawn from the +queue. If both the to-be-queued and the drawn packet belong to the same flow, +both packets are dropped. Otherwise, if the queue length is still below the maximum length, +the new packet has a configurable chance of being marked (which may mean dropped). +If the queue length exceeds +.BR max , +the new packet will always be marked (or dropped). +If the queue length exceeds +.BR limit , +the new packet is always dropped. + +The marking probability computation is the same as used by the RED qdisc. + +.SH PARAMETERS +The parameters are the same as for RED, except that RED uses bytes whereas choke +counts packets. See +.BR tc-red (8) +for a description. + +.SH SOURCE +.TP +o +R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless +Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", +IEEE INFOCOM, 2000. +.TP +o +A. Tang, J. Wang, S. 
Low, "Understanding CHOKe: Throughput and Spatial +Characteristics", IEEE/ACM Transactions on Networking, 2004 + +.SH SEE ALSO +.BR tc (8), +.BR tc-red (8) + +.SH AUTHOR +sched_choke was contributed by Stephen Hemminger. diff --git a/man/man8/tc-codel.8 b/man/man8/tc-codel.8 new file mode 100644 index 0000000..e538e94 --- /dev/null +++ b/man/man8/tc-codel.8 @@ -0,0 +1,122 @@ +.TH CoDel 8 "23 May 2012" "iproute2" "Linux" +.SH NAME +CoDel \- Controlled-Delay Active Queue Management algorithm +.SH SYNOPSIS +.B tc qdisc ... codel +[ +.B limit +PACKETS ] [ +.B target +TIME ] [ +.B interval +TIME ] [ +.B ecn +| +.B noecn +] [ +.B ce_threshold +TIME ] + +.SH DESCRIPTION +CoDel (pronounced "coddle") is an adaptive "no-knobs" active queue management +algorithm (AQM) scheme that was developed to address the shortcomings of +RED and its variants. It was developed with the following goals +in mind: + o It should be parameterless. + o It should keep delays low while permitting bursts of traffic. + o It should control delay. + o It should adapt dynamically to changing link rates with no impact on +utilization. + o It should be simple and efficient and should scale from simple to +complex routers. + +.SH ALGORITHM +CoDel comes with three major innovations. Instead of using queue size or queue +average, it uses the local minimum queue as a measure of the standing/persistent queue. +Second, it uses a single state-tracking variable of the minimum delay to see where it +is relative to the standing queue delay. Third, instead of measuring queue size +in bytes or packets, it is measured in packet-sojourn time in the queue. + +CoDel measures the minimum local queue delay (i.e. standing queue delay) and +compares it to the value of the given acceptable queue delay +.B target. +As long as the minimum queue delay is less than +.B target +or the buffer contains fewer than MTU worth of bytes, packets are not dropped. 
+Codel enters a dropping mode when the minimum queue delay has exceeded +.B target +for a time greater than +.B interval. +In this mode, packets are dropped at different drop times which are set by a +control law. The control law ensures that the packet drops cause a linear change +in the throughput. Once the minimum delay goes below +.B target, +packets are no longer dropped. + +Additional details can be found in the paper cited below. + +.SH PARAMETERS +.SS limit +hard limit on the real queue size. When this limit is reached, incoming packets +are dropped. If the value is lowered, packets are dropped so that the new limit is +met. Default is 1000 packets. + +.SS target +is the acceptable minimum standing/persistent queue delay. This minimum delay +is identified by tracking the local minimum queue delay that packets experience. +Default and recommended value is 5ms. + +.SS interval +is used to ensure that the measured minimum delay does not become too stale. The +minimum delay must be experienced in the last epoch of length +.B interval. +It should be set on the order of the worst-case RTT through the bottleneck to +give endpoints sufficient time to react. Default value is 100ms. + +.SS ecn | noecn +can be used to mark packets instead of dropping them. If +.B ecn +has been enabled, +.B noecn +can be used to turn it off and vice versa. By default, +.B ecn +is turned off. + +.SS ce_threshold +sets a threshold above which all packets are marked with ECN Congestion +Experienced. This is useful for DCTCP-style congestion control algorithms that +require marking at very shallow queueing thresholds.
+ + +.SH EXAMPLES + # tc qdisc add dev eth0 root codel + # tc -s qdisc show + qdisc codel 801b: dev eth0 root refcnt 2 limit 1000p target 5.0ms +interval 100.0ms + Sent 245801662 bytes 275853 pkt (dropped 0, overlimits 0 requeues 24) + backlog 0b 0p requeues 24 + count 0 lastcount 0 ldelay 2us drop_next 0us + maxpacket 7306 ecn_mark 0 drop_overlimit 0 + + # tc qdisc add dev eth0 root codel limit 100 target 4ms interval 30ms ecn + # tc -s qdisc show + qdisc codel 801c: dev eth0 root refcnt 2 limit 100p target 4.0ms +interval 30.0ms ecn + Sent 237573074 bytes 268561 pkt (dropped 0, overlimits 0 requeues 5) + backlog 0b 0p requeues 5 + count 0 lastcount 0 ldelay 76us drop_next 0us + maxpacket 2962 ecn_mark 0 drop_overlimit 0 + + +.SH SEE ALSO +.BR tc (8), +.BR tc-red (8) + +.SH SOURCES +o Kathleen Nichols and Van Jacobson, "Controlling Queue Delay", ACM Queue, +http://queue.acm.org/detail.cfm?id=2209336 + +.SH AUTHORS +CoDel was implemented by Eric Dumazet and David Taht. This manpage was written +by Vijay Subramanian. Please report corrections to the Linux Networking +mailing list <netdev@vger.kernel.org>. diff --git a/man/man8/tc-connmark.8 b/man/man8/tc-connmark.8 new file mode 100644 index 0000000..44f29f5 --- /dev/null +++ b/man/man8/tc-connmark.8 @@ -0,0 +1,55 @@ +.TH "Connmark retriever action in tc" 8 "11 Jan 2016" "iproute2" "Linux" + +.SH NAME +connmark \- netfilter connmark retriever action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action connmark " [ " zone" +.IR u16_zone_index " ] [ " CONTROL " ] [" +.BI index " u32_index " +] + +.ti -8 +.IR CONTROL " := { " reclassify " | " pipe " | " drop " | " continue " | " ok " }" +.SH DESCRIPTION +The connmark action is used to restore the connection's mark value into the +packet's fwmark. +.SH OPTIONS +.TP +.BI zone " u16_zone_index" +Specify the conntrack zone when doing conntrack lookups for packets. +.I u16_zone_index +is a 16bit unsigned decimal value.
+.TP +.I CONTROL +How to continue after executing this action. +.RS +.TP +.B reclassify +Restarts classification by jumping back to the first filter attached to this +action's parent. +.TP +.B pipe +Continue with the next action; this is the default. +.TP +.B drop +.TQ +.B shot +Packet will be dropped without running further actions. +.TP +.B continue +Continue classification with next filter in line. +.TP +.B pass +Return to calling qdisc for packet processing. This ends the classification +process. +.RE +.TP +.BI index " u32_index " +Specify an index for this action in order to be able to identify it in later +commands. +.I u32_index +is a 32bit unsigned decimal value. +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-csum.8 b/man/man8/tc-csum.8 new file mode 100644 index 0000000..65724b8 --- /dev/null +++ b/man/man8/tc-csum.8 @@ -0,0 +1,72 @@ +.TH "Checksum action in tc" 8 "11 Jan 2015" "iproute2" "Linux" + +.SH NAME +csum \- checksum update action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action csum" +.I UPDATE + +.ti -8 +.IR UPDATE " := " TARGET " [ " UPDATE " ]" + +.ti -8 +.IR TARGET " := { " +.BR ip4h " |" +.BR icmp " |" +.BR igmp " |" +.BR tcp " |" +.BR udp " |" +.BR udplite " |" +.BR sctp " |" +.IR SWEETS " }" + +.ti -8 +.IR SWEETS " := { " +.BR and " | " or " | " + " }" +.SH DESCRIPTION +The +.B csum +action triggers checksum recalculation of specified packet headers. It is +commonly used to fix incorrect checksums after the +.B pedit +action has modified the packet content. +.SH OPTIONS +.TP +.I TARGET +Specify which headers to update: IPv4 header +.RB ( ip4h ), +ICMP header +.RB ( icmp ), +IGMP header +.RB ( igmp ), +TCP header +.RB ( tcp ), +UDP header +.RB ( udp ), +UDPLite header +.RB ( udplite ") or" +SCTP header +.RB ( sctp ). +.TP +.B SWEETS +These are merely syntactic sugar and ignored internally. +.SH EXAMPLES +The following performs stateless NAT for incoming packets from 192.0.2.100 to +new destination 198.51.100.1.
Assuming these are UDP +packets, both IP and UDP checksums have to be recalculated: + +.RS +.EX +# tc qdisc add dev eth0 ingress handle ffff: +# tc filter add dev eth0 prio 1 protocol ip parent ffff: \\ + u32 match ip src 192.0.2.100/32 flowid :1 \\ + action pedit munge ip dst set 198.51.100.1 pipe \\ + csum ip and udp +.EE +.RE + +.SH SEE ALSO +.BR tc (8), +.BR tc-pedit (8) diff --git a/man/man8/tc-ct.8 b/man/man8/tc-ct.8 new file mode 100644 index 0000000..2fb81ca --- /dev/null +++ b/man/man8/tc-ct.8 @@ -0,0 +1,107 @@ +.TH "ct action in tc" 8 "14 May 2020" "iproute2" "Linux" +.SH NAME +ct \- tc connection tracking action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR "tc ... action ct commit [ force ] [ zone " +.IR ZONE +.BR "] [ mark " +.IR MASKED_MARK +.BR "] [ label " +.IR MASKED_LABEL +.BR "] [ nat " +.IR NAT_SPEC +.BR "]" + +.ti -8 +.BR "tc ... action ct [ nat ] [ zone " +.IR ZONE +.BR "]" + +.ti -8 +.BR "tc ... action ct clear" + +.SH DESCRIPTION +The ct action is a tc action for sending packets to, and interacting with, the netfilter conntrack module. + +It can (as shown in the synopsis, in order): + +Send the packet to conntrack, and commit the connection, while configuring +a 32bit mark, 128bit label, and src/dst nat. + +Send the packet to conntrack, which will mark the packet with the connection's state and +configured metadata (mark/label), and execute previously configured nat. + +Clear the packet of previous connection tracking state. + +.SH OPTIONS +.TP +.BI zone " ZONE" +Specify a conntrack zone number on which to send the packet to conntrack. +.TP +.BI mark " MASKED_MARK" +Specify a masked 32bit mark to set for the connection (only valid with commit). +.TP +.BI label " MASKED_LABEL" +Specify a masked 128bit label to set for the connection (only valid with commit).
+.TP +.BI nat " NAT_SPEC" +.BI Where " NAT_SPEC " ":= {src|dst} addr" " addr1" "[-" "addr2" "] [port " "port1" "[-" "port2" "]]" + +Specify src/dst and range of nat to configure for the connection (only valid with commit). +.RS +.TP +src/dst - configure src or dst nat +.TP +.BI "" "addr1" "/" "addr2" " - IPv4/IPv6 addresses" +.TP +.BI "" "port1" "/" "port2" " - Port numbers" +.RE +.TP +.BI nat +Restore any previously configured nat. +.TP +.BI clear +Remove any conntrack state and metadata (mark/label) from the packet (must be the only option specified). +.TP +.BI force +Forces conntrack direction for a previously committed connection, so that current direction will become the original direction (only valid with commit). + +.SH EXAMPLES +Example showing natted firewall in conntrack zone 2, and conntrack mark usage: +.EX + +#Add ingress qdisc on eth0 and eth1 interfaces +.nf +$ tc qdisc add dev eth0 ingress +$ tc qdisc add dev eth1 ingress + +#Setup filters on eth0, allowing opening new connections in zone 2, and doing src nat + mark for each new connection +$ tc filter add dev eth0 ingress prio 1 chain 0 proto ip flower ip_proto tcp ct_state -trk \\ +action ct zone 2 pipe action goto chain 2 +$ tc filter add dev eth0 ingress prio 1 chain 2 proto ip flower ct_state +trk+new \\ +action ct zone 2 commit mark 0xbb nat src addr 5.5.5.7 pipe action mirred egress redirect dev eth1 +$ tc filter add dev eth0 ingress prio 1 chain 2 proto ip flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \\ +action ct nat pipe action mirred egress redirect dev eth1 + +#Setup filters on eth1, allowing only established connections of zone 2 through, and reverse nat (dst nat in this case) +$ tc filter add dev eth1 ingress prio 1 chain 0 proto ip flower ip_proto tcp ct_state -trk \\ +action ct zone 2 pipe action goto chain 1 +$ tc filter add dev eth1 ingress prio 1 chain 1 proto ip flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \\ +action ct nat pipe action mirred egress redirect dev eth0 +.fi + +.EE +
+.RE +.SH SEE ALSO +.BR tc (8), +.BR tc-flower (8) +.BR tc-mirred (8) +.SH AUTHORS +Paul Blakey <paulb@mellanox.com> + +Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + +Yossi Kuperman <yossiku@mellanox.com> diff --git a/man/man8/tc-ctinfo.8 b/man/man8/tc-ctinfo.8 new file mode 100644 index 0000000..efa2eec --- /dev/null +++ b/man/man8/tc-ctinfo.8 @@ -0,0 +1,171 @@ +.TH "ctinfo action in tc" 8 "4 Jun 2019" "iproute2" "Linux" +.SH NAME +ctinfo \- tc connmark processing action +.SH SYNOPSIS +.B tc ... action ctinfo +[ +.B dscp +MASK [STATEMASK] ] [ +.B cpmark +[MASK] ] [ +.B zone +ZONE ] [ +.B CONTROL +] [ +.B index +<INDEX> +] + +.SH DESCRIPTION +CTINFO (Conntrack Information) is a tc action for retrieving data from +conntrack marks into various fields. At present it has two independent +processing modes which may be viewed as sub-functions. + +DSCP mode copies a DSCP stored in conntrack's connmark into the IPv4/v6 diffserv +field. The copying may conditionally occur based on a flag also stored in the +connmark. DSCP mode was designed to assist in restoring packet classifications on +ingress, classifications which may then be used by qdiscs such as CAKE. It may be +used in any circumstance where ingress classification needs to be maintained across +links that otherwise bleach or remap according to their own policies. + +CPMARK (copymark) mode copies the conntrack connmark into the packet's mark field. Without +additional parameters it is functionally completely equivalent to the existing +connmark action. An optional mask may be specified to mask which bits of the +connmark are restored. This may be useful when DSCP and CPMARK modes are combined. + +Simple statistics (tc -s) on DSCP restores and CPMARK copies are maintained where values for +set indicate a count of packets altered for that mode. DSCP includes an error count +where the destination packet's diffserv field was unwriteable. 
+.SH PARAMETERS +.SS DSCP mode parameters: +.IP mask +A mask of 6 contiguous bits indicating where the DSCP value is located in the 32 bit +conntrack mark field. A mask must be provided for this mode. mask is a 32 bit +unsigned value. +.IP statemask +A mask of at least 1 bit indicating where a conditional restore flag is located in the +32 bit conntrack mark field. The statemask bit/s must NOT overlap the mask bits. The +DSCP will be restored if the conntrack mark logically ANDed with the statemask yields +a non-zero result. statemask is an optional unsigned 32 bit value. +.SS CPMARK mode parameters: +.IP mask +Store the logically ANDed result of conntrack mark and mask into the packet's mark +field. Default is 0xffffffff i.e. the whole mark field. mask is an optional unsigned 32 bit +value. +.SS Overall action parameters: +.IP zone +Specify the conntrack zone when doing conntrack lookups for packets. +zone is a 16bit unsigned decimal value. +Default is 0. +.IP CONTROL +The following keywords allow one to control how the tree of qdisc, classes, +filters and actions is further traversed after this action. +.RS +.TP +.B reclassify +Restart with the first filter in the current list. +.TP +.B pipe +Continue with the next action attached to the same filter. +.TP +.B drop +Drop the packet. +.TP +.B shot +synonym for +.B drop +.TP +.B continue +Continue classification with the next filter in line. +.TP +.B pass +Finish classification process and return to calling qdisc for further packet +processing. This is the default. +.RE +.IP index +Specify an index for this action in order to be able to identify it in later +commands. index is a 32bit unsigned decimal value.
+.SH EXAMPLES +Example showing conditional restoration of DSCP on ingress via an IFB +.RS +.EX + +#Set up the IFB interface +.br +tc qdisc add dev ifb4eth0 handle ffff: ingress + +#Put CAKE qdisc on it +.br +tc qdisc add dev ifb4eth0 root cake bandwidth 40mbit + +#Set interface UP +.br +ip link set dev ifb4eth0 up + +#Add 2 actions, ctinfo to restore dscp & mirred to redirect the packets to IFB +.br +tc filter add dev eth0 parent ffff: protocol all prio 10 u32 \\ + match u32 0 0 flowid 1:1 action \\ + ctinfo dscp 0xfc000000 0x01000000 \\ + mirred egress redirect dev ifb4eth0 + +tc -s qdisc show dev eth0 ingress + + filter parent ffff: protocol all pref 10 u32 chain 0 + filter parent ffff: protocol all pref 10 u32 chain 0 fh 800: ht divisor 1 + filter parent ffff: protocol all pref 10 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 not_in_hw + match 00000000/00000000 at 0 + action order 1: ctinfo zone 0 pipe + index 2 ref 1 bind 1 dscp 0xfc000000 0x01000000 installed 72 sec used 0 sec DSCP set 1333 error 0 CPMARK set 0 + Action statistics: + Sent 658484 bytes 1833 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 + + action order 2: mirred (Egress Redirect to device ifb4eth0) stolen + index 1 ref 1 bind 1 installed 72 sec used 0 sec + Action statistics: + Sent 658484 bytes 1833 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 +.EE +.RE + +Example showing conditional restoration of DSCP on egress + +This may appear nonsensical since iptables marking of egress packets is easy +to achieve, however the iptables flow classification rules may be extensive +and so some sort of set once and forget may be useful especially on cpu +constrained devices. 
+.RS +.EX + +# Send unmarked connections to a marking chain which needs to store a DSCP +and set statemask bit in the connmark +.br +iptables -t mangle -A POSTROUTING -o eth0 -m connmark \\ + --mark 0x00000000/0x01000000 -g CLASS_MARKING_CHAIN + +# Apply marked DSCP to the packets +.br +tc filter add dev eth0 protocol all prio 10 u32 \\ + match u32 0 0 flowid 1:1 action \\ + ctinfo dscp 0xfc000000 0x01000000 + +tc -s filter show dev eth0 + filter parent 800e: protocol all pref 10 u32 chain 0 + filter parent 800e: protocol all pref 10 u32 chain 0 fh 800: ht divisor 1 + filter parent 800e: protocol all pref 10 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 not_in_hw + match 00000000/00000000 at 0 + action order 1: ctinfo zone 0 pipe + index 1 ref 1 bind 1 dscp 0xfc000000 0x01000000 installed 7414 sec used 0 sec DSCP set 53404 error 0 CPMARK set 0 + Action statistics: + Sent 32890260 bytes 120441 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 +.EE +.RE +.SH SEE ALSO +.BR tc (8), +.BR tc-cake (8), +.BR tc-connmark (8), +.BR tc-mirred (8) +.SH AUTHORS +ctinfo was written by Kevin Darbyshire-Bryant. diff --git a/man/man8/tc-drr.8 b/man/man8/tc-drr.8 new file mode 100644 index 0000000..2fea4ee --- /dev/null +++ b/man/man8/tc-drr.8 @@ -0,0 +1,94 @@ +.TH TC 8 "January 2010" "iproute2" "Linux" +.SH NAME +drr \- deficit round robin scheduler +.SH SYNOPSIS +.B tc qdisc ... add drr +.B [ quantum +bytes +.B ] + +.SH DESCRIPTION + +The Deficit Round Robin Scheduler is a classful queuing discipline as +a more flexible replacement for Stochastic Fairness Queuing. + +Unlike SFQ, there are no built-in queues \-\- you need to add classes +and then set up filters to classify packets accordingly. +This can be useful e.g. for using RED qdiscs with different settings for particular +traffic. There is no default class \-\- if a packet cannot be classified, +it is dropped. 
+ +.SH ALGORITHM +Each class is assigned a deficit counter, initialized to +.B quantum. + +DRR maintains an (internal) ''active'' list of classes whose qdiscs are +non-empty. This list is used for dequeuing. A packet is dequeued from +the class at the head of the list if the packet size is smaller than or equal +to the deficit counter. If the counter is too small, it is increased by +.B quantum +and the scheduler moves on to the next class in the active list. + + +.SH PARAMETERS +.TP +quantum +Amount of bytes a flow is allowed to dequeue before the scheduler moves to +the next class. Defaults to the MTU of the interface. The minimum value is 1. + +.SH EXAMPLE & USAGE + +To attach to device eth0, using the interface MTU as its quantum: +.P +# tc qdisc add dev eth0 handle 1 root drr +.P +Adding two classes: +.P +# tc class add dev eth0 parent 1: classid 1:1 drr +.br +# tc class add dev eth0 parent 1: classid 1:2 drr +.P +You also need to add at least one filter to classify packets. +.P +# tc filter add dev eth0 protocol .. classid 1:1 +.P + +Like SFQ, DRR is only useful when it owns the queue \-\- it is a pure scheduler and does +not delay packets. Attaching non-work-conserving qdiscs like tbf to it does not make +sense \-\- other qdiscs in the active list will also become inactive until the dequeue +operation succeeds. Embed DRR within another qdisc like HTB or HFSC to ensure it owns the queue. +.P +You can mimic SFQ behavior by assigning packets to the attached classes using the +flow filter: + +.B tc qdisc add dev .. drr + +.B for i in .. 1024;do +.br +.B "\ttc class add dev .. classid $handle:$(printf %x $i)" +.br +.B "\ttc qdisc add dev .. fifo limit 16" +.br +.B done + +.B tc filter add .. protocol ip .. $handle flow hash keys src,dst,proto,proto-src,proto-dst divisor 1024 perturb 10 + + +.SH SOURCE +.TP +o +M. Shreedhar and George Varghese "Efficient Fair +Queuing using Deficit Round Robin", Proc. SIGCOMM 95. 
+ +.SH NOTES + +This implementation does not drop packets from the longest queue on overrun, +as limits are handled by the individual child qdiscs. + +.SH SEE ALSO +.BR tc (8), +.BR tc-htb (8), +.BR tc-sfq (8) + +.SH AUTHOR +sched_drr was written by Patrick McHardy. diff --git a/man/man8/tc-ematch.8 b/man/man8/tc-ematch.8 new file mode 100644 index 0000000..3df870f --- /dev/null +++ b/man/man8/tc-ematch.8 @@ -0,0 +1,160 @@ +.TH ematch 8 "6 August 2012" iproute2 Linux +. +.SH NAME +ematch \- extended matches for use with "basic", "cgroup" or "flow" filters +. +.SH SYNOPSIS +.sp +.ad l +.B "tc filter add .. basic match" +.RI EXPR +.B .. flowid .. +.sp + +.IR EXPR " := " TERM " [ { " +.B and | or +} +.IR EXPR +] + +.IR TERM " := [ " \fBnot " ] { " MATCH " | '(' " EXPR " ')' } " + +.IR MATCH " := " module " '(' " ARGS " ')' " + +.IR ARGS " := " ARG1 " " ARG2 " .. + +.SH MATCHES + +.SS cmp +Simple comparison ematch: arithmetic compare of packet data to a given value. + +.IR cmp "( " ALIGN " at " OFFSET " [ " ATTRS " ] { " eq " | " lt " | " gt " } " VALUE " ) + +.IR ALIGN " := { " u8 " | " u16 " | " u32 " } " + +.IR ATTRS " := [ layer " LAYER " ] [ mask " MASK " ] [ trans ] + +.IR LAYER " := { " link " | " network " | " transport " | " 0..2 " } + +.SS meta +Metadata ematch + +.IR meta "( " OBJECT " { " eq " | " lt " |" gt " } " OBJECT " ) + +.IR OBJECT " := { " META_ID " | " VALUE " } + +.IR META_ID " := " id " [ shift " SHIFT " ] [ mask " MASK " ] + +.TP +meta attributes: + +\fBrandom\fP 32 bit random value + +\fBloadavg_1\fP Load average in last minute + +\fBnf_mark\fP Netfilter mark + +\fBvlan\fP Vlan tag + +\fBsk_rcvbuf\fP Receive buffer size + +\fBsk_snd_queue\fP Send queue length + +.PP +A full list of meta attributes can be obtained via + +# tc filter add dev eth1 basic match 'meta(list)' + +.SS nbyte +match packet data byte sequence + +.IR nbyte "( " NEEDLE " at " OFFSET " [ layer " LAYER " ] ) + +.IR NEEDLE " := { " string " | " c-escape-sequence " } " + +.IR 
OFFSET " := " int + +.IR LAYER " := { " link " | " network " | " transport " | " 0..2 " } + +.SS u32 +u32 ematch + +.IR u32 "( " ALIGN " " VALUE " " MASK " at [ nexthdr+ ] " OFFSET " ) + +.IR ALIGN " := { " u8 " | " u16 " | " u32 " } + +.SS ipset +test packet against ipset membership + +.IR ipset "( " SETNAME " " FLAGS " ) + +.IR SETNAME " := " string + +.IR FLAGS " := { " FLAG " [, " FLAGS "] } + +The flag options are the same as those used by the iptables "set" match. + +When using the ipset ematch with the "ip_set_hash:net,iface" set type, +the interface can be queried using "src,dst" (source ip address, outgoing interface) or +"src,src" (source ip address, incoming interface) syntax. + +.SS ipt +test packet against xtables matches + +.IR ipt "( " [-6] " "-m " " MATCH_NAME " " FLAGS " ) + +.IR MATCH_NAME " := " string + +.IR FLAGS " := { " FLAG " [, " FLAGS "] } + +The flag options are the same as those used by the xtable match used. + +.SS canid +ematch rule to match CAN frames + +.IR canid "( " IDLIST " ) + +.IR IDLIST " := " IDSPEC [ IDLIST ] + +.IR IDSPEC " := { ’sff’ " CANID " | ’eff’ " CANID " } + +.IR CANID " := " ID [ ":MASK" ] + +.IR ID ", " MASK " := hexadecimal number (i.e. 0x123) + +.SH CAVEATS + +The ematch syntax uses '(' and ')' to group expressions. All parentheses need to be +escaped properly to prevent shell commandline from interpreting these directly. + +When using the ipset ematch with the "ifb" device, the outgoing device will be the +ifb device itself, e.g. "ifb0". +The original interface (i.e. the device the packet arrived on) is treated as the incoming interface. + +.SH EXAMPLE & USAGE + +# tc filter add .. basic match ... 
+ +# 'cmp(u16 at 3 layer 2 mask 0xff00 gt 20)' + +# 'meta(nfmark gt 24)' and 'meta(tcindex mask 0xf0 eq 0xf0)' + +# 'nbyte("ababa" at 12 layer 1)' + +# 'u32(u16 0x1122 0xffff at nexthdr+4)' + +Check if packet source ip address is a member of set named \fBbulk\fP: + +# 'ipset(bulk src)' + +Check if packet source ip and the interface the packet arrived on are members of "hash:net,iface" set named \fBinteractive\fP: + +# 'ipset(interactive src,src)' + +Check if packet matches an IPSec state with reqid 1: + +# 'ipt(-m policy --dir in --pol ipsec --reqid 1)' + +.SH "AUTHOR" + +The extended match infrastructure was added by Thomas Graf. diff --git a/man/man8/tc-etf.8 b/man/man8/tc-etf.8 new file mode 100644 index 0000000..4cb3b9e --- /dev/null +++ b/man/man8/tc-etf.8 @@ -0,0 +1,151 @@ +.TH ETF 8 "05 Jul 2018" "iproute2" "Linux" +.SH NAME +ETF \- Earliest TxTime First (ETF) Qdisc +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B parent +classid +.B [ handle +major: +.B ] etf clockid +clockid +.B [ delta +delta_nsecs +.B ] [ deadline_mode ] +.B [ offload ] + +.SH DESCRIPTION +The ETF (Earliest TxTime First) qdisc allows applications to control +the instant when a packet should be dequeued from the traffic control +layer into the netdevice. If +.B offload +is configured and supported by the network interface card, then it will +also control when packets leave the network controller. + +ETF achieves that by buffering packets until a configurable time +before their transmission time (i.e. txtime, or deadline), which can +be configured through the +.B delta +option. + +The qdisc uses a rb-tree internally so packets are always 'ordered' by +their txtime and will be dequeued following the (next) earliest txtime +first. 
+ +It relies on the SO_TXTIME socket option and the SCM_TXTIME CMSG in +each packet field to configure the behavior of time dependent sockets: +the clockid to be used as a reference, if the expected mode of txtime +for that socket is deadline or strict mode, and if packet drops should +be reported on the socket's error queue. See +.BR socket(7) +for more information. + +The etf qdisc will drop any packets with a txtime in the past, or if a +packet expires while waiting to be dequeued. + +This queueing discipline is intended to be used by TSN (Time Sensitive +Networking) applications, and it exposes a traffic shaping functionality +that is commonly documented as "Launch Time" or "Time-Based Scheduling" +by vendors and the documentation of network interface controllers. + +ETF is meant to be installed under another qdisc that maps packet flows +to traffic classes, one example is +.BR mqprio(8). + +.SH PARAMETERS +.TP +clockid +.br +Specifies the clock to be used by qdisc's internal timer for measuring +time and scheduling events. The qdisc expects packets passing +through it to be using this same +.B clockid +as the reference of their txtime timestamps. It will drop packets +coming from sockets that do not comply with that. + +For more information about time and clocks on Linux, please refer +to +.BR time(7) +and +.BR clock_gettime(3). + +.TP +delta +.br +After enqueueing or dequeueing a packet, the qdisc will schedule its +next wake-up time for the next txtime minus this delta value. +This means +.B delta +can be used as a fudge factor for the scheduler latency of a system. +This value must be specified in nanoseconds. +The default value is 0 nanoseconds. + +.TP +deadline_mode +.br +When +.B deadline_mode +is set, the qdisc will handle txtime with different semantics, +changed from a 'strict' transmission time to a deadline. +In practice, this means during the dequeue flow +.BR etf(8) +will set the txtime of the packet being dequeued to 'now'. 
+The default is for this option to be disabled. + +.TP +offload +.br +When +.B offload +is set, +.BR etf(8) +will try to configure the network interface so time-based transmission +arbitration is enabled in the controller. This feature is commonly +referred to as "Launch Time" or "Time-Based Scheduling" by the +documentation of network interface controllers. +The default is for this option to be disabled. + +.TP +skip_sock_check +.br +.BR etf(8) +currently drops any packet which does not have a socket associated with it or +if the socket does not have SO_TXTIME socket option set. But, this will not +work if the launchtime is set by another entity inside the kernel (e.g. some +other Qdisc). Setting the skip_sock_check will skip checking for a socket +associated with the packet. + +.SH EXAMPLES + +ETF is used to enforce a Quality of Service. It controls when each +packet should be dequeued and transmitted, and can be used for +limiting the data rate of a traffic class. To separate packets into +traffic classes the user may choose +.BR mqprio(8), +and configure it like this: + +.EX +# tc qdisc add dev eth0 handle 100: parent root mqprio num_tc 3 \\ + map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\ + queues 1@0 1@1 2@2 \\ + hw 0 +.EE +.P +To replace the current queueing discipline by ETF in traffic class +number 0, issue: +.P +.EX +# tc qdisc replace dev eth0 parent 100:1 etf \\ + clockid CLOCK_TAI delta 300000 offload +.EE + +With the options above, etf will be configured to use CLOCK_TAI as +its clockid_t, will schedule packets for 300 us before their txtime, +and will enable the functionality on that in the network interface +card. Deadline mode will not be configured for this mode. 
+ +.SH AUTHORS +Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> +.br +Vinicius Costa Gomes <vinicius.gomes@intel.com> diff --git a/man/man8/tc-ets.8 b/man/man8/tc-ets.8 new file mode 100644 index 0000000..d3e6816 --- /dev/null +++ b/man/man8/tc-ets.8 @@ -0,0 +1,192 @@ +.TH TC 8 "December 2019" "iproute2" "Linux" +.SH NAME +ETS \- Enhanced Transmission Selection scheduler +.SH SYNOPSIS +.B tc qdisc ... ets [ bands +number +.B ] [ strict +number +.B ] [ quanta +bytes bytes bytes... +.B ] [ priomap +band band band... +.B ] + +.B tc class ... ets [ quantum +bytes +.B ] + +.SH DESCRIPTION + +The Enhanced Transmission Selection scheduler is a classful queuing +discipline that merges functionality of PRIO and DRR qdiscs in one +scheduler. ETS makes it easy to configure a set of strict and +bandwidth-sharing bands to implement the transmission selection described +in 802.1Qaz. + +On creation with 'tc qdisc add', a fixed number of bands is created. Each +band is a class, although it is not possible to directly add and remove +bands with 'tc class' commands. The number of bands to be created must +instead be specified on the command line as the qdisc is added. + +The minor number of classid to use when referring to a band is the band +number increased by one. Thus band 0 will have classid of major:1, band 1 +that of major:2, etc. + +ETS bands are of two types: some number may be in strict mode, the +remaining ones are in bandwidth-sharing mode. + +.SH ALGORITHM +When dequeuing, strict bands are tried first, if there are any. Band 0 is +tried first. If it did not deliver a packet, band 1 is tried next, and so +on until one of the bands delivers a packet, or the strict bands are +exhausted. + +If no packet has been dequeued from any of the strict bands, if there are +any bandwidth-sharing bands, the dequeuing proceeds according to the DRR +algorithm. Each bandwidth-sharing band is assigned a deficit counter, +initialized to quantum assigned by a +.B quanta +element. 
ETS maintains an (internal) ''active'' list of bandwidth-sharing +bands whose qdiscs are non-empty. This list is used for dequeuing. A packet +is dequeued from the band at the head of the list if the packet size is +smaller than or equal to the deficit counter. If the counter is too small, it is +increased by +.B quantum +and the scheduler moves on to the next band in the active list. + +Only qdiscs that own their queue should be added below the +bandwidth-sharing bands. Attaching non-work-conserving qdiscs like +TBF to them does not make sense \-\- other qdiscs in the active list will be +skipped until the dequeue operation succeeds. This limitation does not +exist with the strict bands. + +.SH CLASSIFICATION +The ETS qdisc allows three ways to decide which band to enqueue a packet +to: + +- Packet priority can be directly set to a class handle, in which case that + is the queue where the packet will be put. For example, band number 2 of + a qdisc with handle of 11: will have classid 11:3. To mark a packet for + queuing to this band, the packet priority should be set to 0x110003. + +- A tc filter attached to the qdisc can put the packet to a band by using + the \fBflowid\fR keyword. + +- As a last resort, the ETS qdisc consults its priomap (see below), which + maps packets to bands based on packet priority. + +.SH PARAMETERS +.TP +strict +The number of bands that should be created in strict mode. If not given, +this value is 0. + +.TP +quanta +Each bandwidth-sharing band needs to know its quantum, which is the amount +of bytes a band is allowed to dequeue before the scheduler moves to the +next bandwidth-sharing band. The +.B quanta +argument lists quanta for the individual bandwidth-sharing bands. +The minimum value of each quantum is 1. If +.B quanta +is not given, the default is no bandwidth-sharing bands, but note that when +specifying a large number of +.B bands, +the extra ones are in bandwidth-sharing mode by default. 
+ +.TP +bands +Number of bands given explicitly. This value has to be at least large +enough to cover the strict bands specified through the +.B strict +keyword and bandwidth-sharing bands specified in +.B quanta. +If a larger value is given, any extra bands are in bandwidth-sharing mode, +and their quanta are deduced from the interface MTU. If no value is given, +as many bands are created as necessary to cover all bands implied by the +.B strict +and +.B quanta +keywords. + +.TP +priomap +The priomap maps the priority of a packet to a band. The argument is a list +of numbers. The first number indicates which band the packets with priority +0 should be put to, the second is for priority 1, and so on. + +There can be up to 16 numbers in the list. If there are fewer, the default +band that traffic with one of the unmentioned priorities goes to is the +last one. + +.SH EXAMPLE & USAGE + +.P +Add a qdisc with 8 bandwidth-sharing bands, using the interface MTU as +their quanta. Since all quanta are the same, this will lead to equal +distribution of bandwidth between the bands, each will get about 12.5% of +the link. The low 8 priorities go to individual bands in a reverse 1:1 +fashion (such that the highest priority goes to the first band). 
+ +.P +# tc qdisc add dev eth0 root handle 1: ets bands 8 priomap 7 6 5 4 3 2 1 0 +.br +# tc qdisc show dev eth0 +.br +qdisc ets 1: root refcnt 2 bands 8 quanta 1514 1514 1514 1514 1514 1514 1514 1514 priomap 7 6 5 4 3 2 1 0 7 7 7 7 7 7 7 7 + +.P +Tweak the first band of the above qdisc to give it a quantum of 2650, which +will give it about 20% of the link (and about 11.5% to the remaining +bands): + +.P +# tc class change dev eth0 classid 1:1 ets quantum 2650 +.br +# tc qdisc show dev eth0 +.br +qdisc ets 1: root refcnt 2 bands 8 quanta 2650 1514 1514 1514 1514 1514 1514 1514 priomap 7 6 5 4 3 2 1 0 7 7 7 7 7 7 7 7 + +.P +Create a purely strict Qdisc with reverse 1:1 mapping between priorities +and bands: + +.P +# tc qdisc add dev eth0 root handle 1: ets strict 8 priomap 7 6 5 4 3 2 1 0 +.br +# tc qdisc sh dev eth0 +.br +qdisc ets 1: root refcnt 2 bands 8 strict 8 priomap 7 6 5 4 3 2 1 0 7 7 7 7 7 7 7 7 + +.P +Add a Qdisc with 6 bands, 3 strict and 3 ETS with 35%-30%-25% weights: +.P +# tc qdisc add dev eth0 root handle 1: ets strict 3 quanta 3500 3000 2500 priomap 0 1 1 1 2 3 4 5 +.br +# tc qdisc sh dev eth0 +.br +qdisc ets 1: root refcnt 2 bands 6 strict 3 quanta 3500 3000 2500 priomap 0 1 1 1 2 3 4 5 5 5 5 5 5 5 5 5 + +.P +Create a Qdisc such that traffic with priorities 2, 3 and 4 are strictly +prioritized over other traffic, and the rest goes into bandwidth-sharing +classes with equal weights: +.P +# tc qdisc add dev eth0 root handle 1: ets bands 8 strict 3 priomap 3 4 0 1 2 5 6 7 +.br +# tc qdisc sh dev eth0 +.br +qdisc ets 1: root refcnt 2 bands 8 strict 3 quanta 1514 1514 1514 1514 1514 priomap 3 4 0 1 2 5 6 7 7 7 7 7 7 7 7 7 + +.SH SEE ALSO +.BR tc (8), +.BR tc-prio (8), +.BR tc-drr (8) + +.SH AUTHOR +Parts of both this manual page and the code itself are taken from PRIO and +DRR qdiscs. +.br +ETS qdisc itself was written by Petr Machata. 
diff --git a/man/man8/tc-flow.8 b/man/man8/tc-flow.8 new file mode 100644 index 0000000..54f6bf7 --- /dev/null +++ b/man/man8/tc-flow.8 @@ -0,0 +1,267 @@ +.TH "Flow filter in tc" 8 "20 Oct 2015" "iproute2" "Linux" + +.SH NAME +flow \- flow based traffic control filter +.SH SYNOPSIS +.TP +Mapping mode: + +.RS +.in +8 +.ti -8 +.BR tc " " filter " ... " "flow map key " +.IR KEY " [ " OPS " ] [ " OPTIONS " ] " +.RE +.TP +Hashing mode: + +.RS +.in +8 +.ti -8 +.BR tc " " filter " ... " "flow hash keys " +.IR KEY_LIST " [ " +.B perturb +.IR secs " ] [ " OPTIONS " ] " +.RE + +.in +8 +.ti -8 +.IR OPS " := [ " OPS " ] " OP + +.ti -8 +.IR OPTIONS " := [ " +.B divisor +.IR NUM " ] [ " +.B baseclass +.IR ID " ] [ " +.B match +.IR EMATCH_TREE " ] [ " +.B action +.IR ACTION_SPEC " ]" + +.ti -8 +.IR KEY_LIST " := [ " KEY_LIST " ] " KEY + +.ti -8 +.IR OP " := { " +.BR or " | " and " | " xor " | " rshift " | " addend " } " +.I NUM + +.ti -8 +.IR ID " := " X : Y + +.ti -8 +.IR KEY " := { " +.BR src " | " dst " | " proto " | " proto-src " | " proto-dst " | " iif " | " +.BR priority " | " mark " | " nfct " | " nfct-src " | " nfct-dst " | " +.BR nfct-proto-src " | " nfct-proto-dst " | " rt-classid " | " sk-uid " | " +.BR sk-gid " | " vlan-tag " | " rxhash " }" +.SH DESCRIPTION +The +.B flow +classifier is meant to extend the +.B SFQ +hashing capabilities without hard-coding new hash functions. It also allows +deterministic mappings of keys to classes. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI baseclass " ID" +An offset for the resulting class ID. +.I ID +may be +.BR root ", " none +or a hexadecimal class ID in the form [\fIX\fB:\fR]\fIY\fR. \fIX\fR must +match qdisc's/class's major handle (if omitted, the correct value is chosen +automatically). If the whole \fBbaseclass\fR is omitted, \fIY\fR defaults +to 1. +.TP +.BI divisor " NUM" +Number of buckets to use for sorting into. 
Keys are calculated modulo +.IR NUM . +.TP +.BI "hash keys " KEY-LIST +Perform a +.B jhash2 +operation over the keys in +.IR KEY-LIST , +the result (modulo the +.B divisor +if given) is taken as class ID, optionally offset by the value of +.BR baseclass . +It is possible to specify an interval (in seconds) after which +.BR jhash2 's +entropy source is recreated using the +.B perturb +parameter. +.TP +.BI "map key " KEY +Packet data identified by +.I KEY +is translated into class IDs to push the packet into. The value may be mangled by +.I OPS +before using it for the mapping. They are applied in the order listed here: +.RS +.TP 4 +.BI and " NUM" +Perform bitwise +.B AND +operation with numeric value +.IR NUM . +.TP +.BI or " NUM" +Perform bitwise +.B OR +operation with numeric value +.IR NUM . +.TP +.BI xor " NUM" +Perform bitwise +.B XOR +operation with numeric value +.IR NUM . +.TP +.BI rshift " NUM" +Shift the value of +.I KEY +to the right by +.I NUM +bits. +.TP +.BI addend " NUM" +Add +.I NUM +to the value of +.IR KEY . + +.RE +.RS +For the +.BR or ", " and ", " xor " and " rshift +operations, +.I NUM +is assumed to be an unsigned, 32bit integer value. For the +.B addend +operation, +.I NUM +may be much more complex: It may be prefixed by a minus ('-') sign to cause +subtraction instead of addition and for keys of +.BR src ", " dst ", " nfct-src " and " nfct-dst +it may be given in IP address notation. See below for an illustrating example. +.RE +.TP +.BI match " EMATCH_TREE" +Match packets using the extended match infrastructure. See +.BR tc-ematch (8) +for a detailed description of the allowed syntax in +.IR EMATCH_TREE . +.SH KEYS +In mapping mode, a single key is used (after optional permutation) to build a +class ID. The resulting ID is deducible in most cases. In hashing mode, a number +of keys may be specified which are then hashed and the output used as class ID. 
+This ID is not deducible beforehand, and may even change over time for a +given flow if a +.B perturb +interval has been given. + +The range of class IDs can be limited by the +.B divisor +option, which is used for a modulus. +.TP +.BR src ", " dst +Use source or destination address as key. In case of IPv4 and TIPC, this is the +actual address value. For IPv6, the 128bit address is folded into a 32bit value +by XOR'ing the four 32bit words. In all other cases, the kernel-internal socket +address is used (after folding into 32bits on 64bit systems). +.TP +.B proto +Use the layer four protocol number as key. +.TP +.B proto-src +Use the layer four source port as key. If not available, the kernel-internal +socket address is used instead. +.TP +.B proto-dst +Use the layer four destination port as key. If not available, the associated +kernel-internal dst_entry address is used after XOR'ing with the packet's +layer three protocol number. +.TP +.B iif +Use the incoming interface index as key. +.TP +.B priority +Use the packet's priority as key. Usually this is the IP header's DSCP/ECN +value. +.TP +.B mark +Use the netfilter +.B fwmark +as key. +.TP +.B nfct +Use the associated conntrack entry address as key. +.TP +.BR nfct-src ", " nfct-dst ", " nfct-proto-src ", " nfct-proto-dst +These are conntrack-aware variants of +.BR src ", " dst ", " proto-src " and " proto-dst . +In case of NAT, these are basically the packet header's values before NAT was +applied. +.TP +.B rt-classid +Use the packet's destination routing table entry's realm as key. +.TP +.B sk-uid +.TQ +.B sk-gid +For locally generated packets, use the user or group ID the originating socket +belongs to as key. +.TP +.B vlan-tag +Use the packet's vlan ID as key. +.TP +.B rxhash +Use the flow hash as key. + +.SH EXAMPLES +.TP +Classic SFQ hash: + +.EX +tc filter add ... 
flow hash \\ + keys src,dst,proto,proto-src,proto-dst divisor 1024 +.EE +.TP +Classic SFQ hash, but using information from conntrack to work properly in combination with NAT: + +.EX +tc filter add ... flow hash \\ + keys nfct-src,nfct-dst,proto,nfct-proto-src,nfct-proto-dst \\ + divisor 1024 +.EE +.TP +Map destination IPs of 192.168.0.0/24 to classids 1-256: + +.EX +tc filter add ... flow map \\ + key dst addend -192.168.0.0 divisor 256 +.EE +.TP +Alternative to the above: + +.EX +tc filter add ... flow map \\ + key dst and 0xff +.EE +.TP +The same, but in reverse order: + +.EX +tc filter add ... flow map \\ + key dst and 0xff xor 0xff +.EE +.SH SEE ALSO +.BR tc (8), +.BR tc-ematch (8), +.BR tc-sfq (8) diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 new file mode 100644 index 0000000..8324581 --- /dev/null +++ b/man/man8/tc-flower.8 @@ -0,0 +1,557 @@ +.TH "Flower filter in tc" 8 "22 Oct 2015" "iproute2" "Linux" + +.SH NAME +flower \- flow based traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... 
" flower " [ " +.IR MATCH_LIST " ] [ " +.B action +.IR ACTION_SPEC " ] [ " +.B classid +.IR CLASSID " ] [ " +.B hw_tc +.IR TCID " ]" + + +.ti -8 +.IR MATCH_LIST " := [ " MATCH_LIST " ] " MATCH + +.ti -8 +.IR MATCH " := { " +.B indev +.IR ifname " | " +.BR verbose +.RI " | " +.BR skip_sw " | " skip_hw +.RI " | { " +.BR dst_mac " | " src_mac " } " +.IR MASKED_LLADDR " | " +.B vlan_id +.IR VID " | " +.B vlan_prio +.IR PRIORITY " | " +.BR vlan_ethtype " { " ipv4 " | " ipv6 " | " +.IR ETH_TYPE " } | " +.B cvlan_id +.IR VID " | " +.B cvlan_prio +.IR PRIORITY " | " +.BR cvlan_ethtype " { " ipv4 " | " ipv6 " | " +.IR ETH_TYPE " } | " +.B pppoe_sid +.IR PSID " | " +.BR ppp_proto " { " ip " | " ipv6 " | " mpls_uc " | " mpls_mc " | " +.IR PPP_PROTO " } | " +.B mpls +.IR LSE_LIST " | " +.B mpls_label +.IR LABEL " | " +.B mpls_tc +.IR TC " | " +.B mpls_bos +.IR BOS " | " +.B mpls_ttl +.IR TTL " | " +.B l2tpv3_sid +.IR LSID " | " +.BR ip_proto " { " tcp " | " udp " | " sctp " | " icmp " | " icmpv6 " | " l2tp " | " +.IR IP_PROTO " } | " +.B ip_tos +.IR MASKED_IP_TOS " | " +.B ip_ttl +.IR MASKED_IP_TTL " | { " +.BR dst_ip " | " src_ip " } " +.IR PREFIX " | { " +.BR dst_port " | " src_port " } { " +.IR MASKED_NUMBER " | " +.IR min_port_number-max_port_number " } | " +.B tcp_flags +.IR MASKED_TCP_FLAGS " | " +.B type +.IR MASKED_TYPE " | " +.B code +.IR MASKED_CODE " | { " +.BR arp_tip " | " arp_sip " } " +.IR IPV4_PREFIX " | " +.BR arp_op " { " request " | " reply " | " +.IR OP " } | { " +.BR arp_tha " | " arp_sha " } " +.IR MASKED_LLADDR " | " +.B enc_key_id +.IR KEY-ID " | {" +.BR enc_dst_ip " | " enc_src_ip " } { " +.IR ipv4_address " | " ipv6_address " } | " +.B enc_dst_port +.IR port_number " | " +.B enc_tos +.IR TOS " | " +.B enc_ttl +.IR TTL " | " +{ +.B geneve_opts +| +.B vxlan_opts +| +.B erspan_opts +| +.B gtp_opts +} +.IR OPTIONS " | " +.BR ip_flags +.IR IP_FLAGS " | " +.B l2_miss +.IR L2_MISS " | " +.BR cfm +.IR CFM_OPTIONS " }" + +.ti -8 +.IR LSE_LIST " := [ " LSE_LIST 
" ] " LSE + +.ti -8 +.IR LSE " := " +.B lse depth +.IR DEPTH " { " +.B label +.IR LABEL " | " +.B tc +.IR TC " | " +.B bos +.IR BOS " | " +.B ttl +.IR TTL " }" + +.ti -8 +.IR CFM " := " +.B cfm mdl +.IR LEVEL " | " +.B op +.IR OPCODE " + +.SH DESCRIPTION +The +.B flower +filter matches flows to the set of keys specified and assigns an arbitrarily +chosen class ID to packets belonging to them. Additionally (or alternatively) an +action from the generic action framework may be called. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Specify a class to pass matching packets on to. +.I CLASSID +is in the form +.BR X : Y ", while " X " and " Y +are interpreted as numbers in hexadecimal format. +.TP +.BI hw_tc " TCID" +Specify a hardware traffic class to pass matching packets on to. TCID is in the +range 0 through 15. +.TP +.BI indev " ifname" +Match on incoming interface name. Obviously this makes sense only for forwarded +flows. +.I ifname +is the name of an interface which must exist at the time of +.B tc +invocation. +.TP +.BI verbose +Enable verbose logging, including offloading errors when not using +.B skip_sw +flag. +.TP +.BI skip_sw +Do not process filter by software. If hardware has no offload support for this +filter, or TC offload is not enabled for the interface, operation will fail. +.TP +.BI skip_hw +Do not process filter by hardware. +.TP +.BI dst_mac " MASKED_LLADDR" +.TQ +.BI src_mac " MASKED_LLADDR" +Match on source or destination MAC address. A mask may be optionally +provided to limit the bits of the address which are matched. A mask is +provided by following the address with a slash and then the mask. It may be +provided in LLADDR format, in which case it is a bitwise mask, or as a +number of high bits to match. If the mask is missing then a match on all +bits is assumed. +.TP +.BI num_of_vlans " NUM" +Match on the number of vlan tags in the packet. 
+.I NUM +can be 0 or a small positive integer, typically in the 0-4 range. +.TP +.BI vlan_id " VID" +Match on vlan tag id. +.I VID +is an unsigned 12bit value in decimal format. +.TP +.BI vlan_prio " PRIORITY" +Match on vlan tag priority. +.I PRIORITY +is an unsigned 3bit value in decimal format. +.TP +.BI vlan_ethtype " VLAN_ETH_TYPE" +Match on layer three protocol. +.I VLAN_ETH_TYPE +may be either +.BR ipv4 ", " ipv6 +or an unsigned 16bit value in hexadecimal format. To match on QinQ packet, it must be 802.1Q or 802.1AD. +.TP +.BI cvlan_id " VID" +Match on QinQ inner vlan tag id. +.I VID +is an unsigned 12bit value in decimal format. +.TP +.BI cvlan_prio " PRIORITY" +Match on QinQ inner vlan tag priority. +.I PRIORITY +is an unsigned 3bit value in decimal format. +.TP +.BI cvlan_ethtype " VLAN_ETH_TYPE" +Match on QinQ layer three protocol. +.I VLAN_ETH_TYPE +may be either +.BR ipv4 ", " ipv6 +or an unsigned 16bit value in hexadecimal format. +.TP +.BI pppoe_sid " PSID" +Match on PPPoE session id. +.I PSID +is an unsigned 16bit value in decimal format. +.TP +.BI ppp_proto " PPP_PROTO" +Match on PPP layer three protocol. +.I PPP_PROTO +may be either +.BR ip ", " ipv6 ", " mpls_uc ", " mpls_mc +or an unsigned 16bit value in hexadecimal format. +.TP +.BI mpls " LSE_LIST" +Match on the MPLS label stack. +.I LSE_LIST +is a list of Label Stack Entries, each introduced by the +.BR lse " keyword." +This option can't be used together with the standalone +.BR mpls_label ", " mpls_tc ", " mpls_bos " and " mpls_ttl " options." +.RS +.TP +.BI lse " LSE_OPTIONS" +Match on an MPLS Label Stack Entry. +.I LSE_OPTIONS +is a list of options that describe the properties of the LSE to match. +.RS +.TP +.BI depth " DEPTH" +The depth of the Label Stack Entry to consider. Depth starts at 1 (the +outermost Label Stack Entry). The maximum usable depth may be limited by the +kernel. This option is mandatory. +.I DEPTH +is an unsigned 8 bit value in decimal format. 
+.TP +.BI label " LABEL" +Match on the MPLS Label field at the specified +.BR depth . +.I LABEL +is an unsigned 20 bit value in decimal format. +.TP +.BI tc " TC" +Match on the MPLS Traffic Class field at the specified +.BR depth . +.I TC +is an unsigned 3 bit value in decimal format. +.TP +.BI bos " BOS" +Match on the MPLS Bottom Of Stack field at the specified +.BR depth . +.I BOS +is a 1 bit value in decimal format. +.TP +.BI ttl " TTL" +Match on the MPLS Time To Live field at the specified +.BR depth . +.I TTL +is an unsigned 8 bit value in decimal format. +.RE +.RE + +.TP +.BI mpls_label " LABEL" +Match the label id in the outermost MPLS label stack entry. +.I LABEL +is an unsigned 20 bit value in decimal format. +.TP +.BI mpls_tc " TC" +Match on the MPLS TC field, which is typically used for packet priority, +in the outermost MPLS label stack entry. +.I TC +is an unsigned 3 bit value in decimal format. +.TP +.BI mpls_bos " BOS" +Match on the MPLS Bottom Of Stack field in the outermost MPLS label stack +entry. +.I BOS +is a 1 bit value in decimal format. +.TP +.BI mpls_ttl " TTL" +Match on the MPLS Time To Live field in the outermost MPLS label stack +entry. +.I TTL +is an unsigned 8 bit value in decimal format. +.TP +.BI l2tpv3_sid " LSID" +Match on L2TPv3 session id field transported over IPv4 or IPv6. +.I LSID +is an unsigned 32 bit value in decimal format. +.TP +.BI ip_proto " IP_PROTO" +Match on layer four protocol. +.I IP_PROTO +may be +.BR tcp ", " udp ", " sctp ", " icmp ", " icmpv6 ", " l2tp +or an unsigned 8bit value in hexadecimal format. +.TP +.BI ip_tos " MASKED_IP_TOS" +Match on ipv4 TOS or ipv6 traffic-class - eight bits in hexadecimal format. +A mask may be optionally provided to limit the bits which are matched. A mask +is provided by following the value with a slash and then the mask. If the mask +is missing then a match on all bits is assumed. 
+.TP +.BI ip_ttl " MASKED_IP_TTL" +Match on ipv4 TTL or ipv6 hop-limit - eight bits value in decimal or hexadecimal format. +A mask may be optionally provided to limit the bits which are matched. Same +logic is used for the mask as with matching on ip_tos. +.TP +.BI dst_ip " PREFIX" +.TQ +.BI src_ip " PREFIX" +Match on source or destination IP address. +.I PREFIX +must be a valid IPv4 or IPv6 address, depending on the \fBprotocol\fR +option to tc filter, optionally followed by a slash and the prefix length. +If the prefix is missing, \fBtc\fR assumes a full-length host match. +.TP +.IR \fBdst_port " { " MASKED_NUMBER " | " " MIN_VALUE-MAX_VALUE " } +.TQ +.IR \fBsrc_port " { " MASKED_NUMBER " | " " MIN_VALUE-MAX_VALUE " } +Match on layer 4 protocol source or destination port number, with an +optional mask. Alternatively, the minimum and maximum values can be +specified to match on a range of layer 4 protocol source or destination +port numbers. Only available for +.BR ip_proto " values " udp ", " tcp " and " sctp +which have to be specified beforehand. +.TP +.BI tcp_flags " MASKED_TCP_FLAGS" +Match on TCP flags represented as a 12bit bitfield in hexadecimal format. +A mask may be optionally provided to limit the bits which are matched. A mask +is provided by following the value with a slash and then the mask. If the mask +is missing then a match on all bits is assumed. +.TP +.BI type " MASKED_TYPE" +.TQ +.BI code " MASKED_CODE" +Match on ICMP type or code. A mask may be optionally provided to limit the +bits of the value which are matched. A mask is provided by following the +value with a slash and then the mask. The mask must be given as a number which +represents a bitwise mask. If the mask is missing then a match on all bits +is assumed. Only available for +.BR ip_proto " values " icmp " and " icmpv6 +which have to be specified beforehand. +.TP +.BI arp_tip " IPV4_PREFIX" +.TQ +.BI arp_sip " IPV4_PREFIX" +Match on ARP or RARP sender or target IP address. 
+.I IPV4_PREFIX +must be a valid IPv4 address optionally followed by a slash and the prefix +length. If the prefix is missing, \fBtc\fR assumes a full-length host +match. +.TP +.BI arp_op " ARP_OP" +Match on ARP or RARP operation. +.I ARP_OP +may be +.BR request ", " reply +or an integer value 0, 1 or 2. A mask may be optionally provided to limit +the bits of the operation which are matched. A mask is provided by +following the address with a slash and then the mask. It may be provided as +an unsigned 8 bit value representing a bitwise mask. If the mask is missing +then a match on all bits is assumed. +.TP +.BI arp_sha " MASKED_LLADDR" +.TQ +.BI arp_tha " MASKED_LLADDR" +Match on ARP or RARP sender or target MAC address. A mask may be optionally +provided to limit the bits of the address which are matched. A mask is +provided by following the address with a slash and then the mask. It may be +provided in LLADDR format, in which case it is a bitwise mask, or as a +number of high bits to match. If the mask is missing then a match on all +bits is assumed. +.TP +.BI enc_key_id " NUMBER" +.TQ +.BI enc_dst_ip " PREFIX" +.TQ +.BI enc_src_ip " PREFIX" +.TQ +.BI enc_dst_port " NUMBER" +.TQ +.BI enc_tos " NUMBER" +.TQ +.BI enc_ttl " NUMBER" +.TQ +.BR +.TP +.BI ct_state " CT_STATE" +.TQ +.BI ct_zone " CT_MASKED_ZONE" +.TQ +.BI ct_mark " CT_MASKED_MARK" +.TQ +.BI ct_label " CT_MASKED_LABEL" +Matches on connection tracking info +.RS +.TP +.I CT_STATE +Match the connection state, and can be combination of [{+|-}flag] flags, where flag can be one of +.RS +.TP +trk - Tracked connection. +.TP +new - New connection. +.TP +est - Established connection. +.TP +rpl - The packet is in the reply direction, meaning that it is in the opposite direction from the packet that initiated the connection. +.TP +inv - The state is invalid. The packet couldn't be associated to a connection. +.TP +rel - The packet is related to an existing connection. 
+.TP +Example: +trk+est +.RE +.TP +.I CT_MASKED_ZONE +Match the connection zone, and can be masked. +.TP +.I CT_MASKED_MARK +32bit match on the connection mark, and can be masked. +.TP +.I CT_MASKED_LABEL +128bit match on the connection label, and can be masked. +.RE +.TP +.BI geneve_opts " OPTIONS" +.TQ +.BI vxlan_opts " OPTIONS" +.TQ +.BI erspan_opts " OPTIONS" +.TQ +.BI gtp_opts " OPTIONS" +Match on IP tunnel metadata. Key id +.I NUMBER +is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel). +.I PREFIX +must be a valid IPv4 or IPv6 address optionally followed by a slash and the +prefix length. If the prefix is missing, \fBtc\fR assumes a full-length +host match. Dst port +.I NUMBER +is a 16 bit UDP dst port. Tos +.I NUMBER +is an 8 bit tos (dscp+ecn) value, ttl +.I NUMBER +is an 8 bit time-to-live value. geneve_opts +.I OPTIONS +must be a valid list of comma-separated geneve options where each option +consists of a key optionally followed by a slash and corresponding mask. If +the mask is missing, \fBtc\fR assumes a full-length match. The options can +be described in the form CLASS:TYPE:DATA/CLASS_MASK:TYPE_MASK:DATA_MASK, +where CLASS is represented as a 16bit hexadecimal value, TYPE as an 8bit +hexadecimal value and DATA as a variable length hexadecimal value. +vxlan_opts +.I OPTIONS +doesn't support multiple options, and it consists of a key followed by a slash +and corresponding mask. If the mask is missing, \fBtc\fR assumes a full-length +match. The option can be described in the form GBP/GBP_MASK, where GBP is +represented as a 32bit number. +erspan_opts +.I OPTIONS +doesn't support multiple options, and it consists of a key followed by a slash +and corresponding mask. If the mask is missing, \fBtc\fR assumes a full-length +match. The option can be described in the form +VERSION:INDEX:DIR:HWID/VERSION:INDEX_MASK:DIR_MASK:HWID_MASK, where VERSION is +represented as an 8bit number, INDEX as a 32bit number, DIR and HWID as 8bit +numbers. 
Multiple options are not supported. Note INDEX/INDEX_MASK is used when +VERSION is 1, and DIR/DIR_MASK and HWID/HWID_MASK are used when VERSION is 2. +gtp_opts +.I OPTIONS +doesn't support multiple options, and it consists of a key followed by a slash +and corresponding mask. If the mask is missing, \fBtc\fR assumes a full-length +match. The option can be described in the form PDU_TYPE:QFI/PDU_TYPE_MASK:QFI_MASK +where both PDU_TYPE and QFI are represented as 8bit hexadecimal values. +.TP +.BI ip_flags " IP_FLAGS" +.I IP_FLAGS +may be either +.BR frag ", " nofrag ", " firstfrag " or " nofirstfrag +where frag and nofrag could be used to match on fragmented packets or not, +respectively. firstfrag and nofirstfrag can be used to further distinguish +fragmented packets. firstfrag can be used to indicate the first fragmented +packet. nofirstfrag can be used to indicate subsequent fragmented packets +or non-fragmented packets. +.TP + +.BI l2_miss " L2_MISS" +Match on layer 2 miss in the bridge driver's FDB / MDB. \fIL2_MISS\fR may be 0 +or 1. When 1, match on packets that encountered a layer 2 miss. When 0, match +on packets that were forwarded using an FDB / MDB entry. Note that broadcast +packets do not encounter a miss since a lookup is not performed for them. +.TP + +.BI cfm " CFM_OPTIONS" +Match on Connectivity Fault Management (CFM) fields. +.I CFM_OPTIONS +is a list of options that describe the properties of the CFM information +fields to match. +.RS +.TP +.BI mdl " LEVEL " +Match on the Maintenance Domain (MD) level field. +\fILEVEL\fR is an unsigned 3 bit value in decimal format. +.TP +.BI op " OPCODE " +Match on the CFM opcode field. \fIOPCODE\fR is an unsigned 8 bit value in +decimal format. + +.SH NOTES +As stated above where applicable, matches of a certain layer implicitly depend +on the matches of the next lower layer. 
Precisely, layer one and two matches +(\fBindev\fR, \fBdst_mac\fR and \fBsrc_mac\fR) +have no dependency, +MPLS and layer three matches +(\fBmpls\fR, \fBmpls_label\fR, \fBmpls_tc\fR, \fBmpls_bos\fR, \fBmpls_ttl\fR, +\fBip_proto\fR, \fBdst_ip\fR, \fBsrc_ip\fR, \fBarp_tip\fR, \fBarp_sip\fR, +\fBarp_op\fR, \fBarp_tha\fR, \fBarp_sha\fR and \fBip_flags\fR) +depend on the +.B protocol +option of tc filter, layer four port matches +(\fBdst_port\fR and \fBsrc_port\fR) +depend on +.B ip_proto +being set to +.BR tcp ", " udp " or " sctp, +and finally ICMP matches (\fBcode\fR and \fBtype\fR) depend on +.B ip_proto +being set to +.BR icmp " or " icmpv6. +.P +Only one mask can be used per prio. If the user needs to specify a different +mask, a different prio has to be used. +.SH SEE ALSO +.BR tc (8), +.BR tc-flow (8) diff --git a/man/man8/tc-fq.8 b/man/man8/tc-fq.8 new file mode 100644 index 0000000..27385aa --- /dev/null +++ b/man/man8/tc-fq.8 @@ -0,0 +1,107 @@ +.TH FQ 8 "10 Sept 2015" "iproute2" "Linux" +.SH NAME +FQ \- Fair Queue traffic policing +.SH SYNOPSIS +.B tc qdisc ... fq +[ +.B limit +PACKETS ] [ +.B flow_limit +PACKETS ] [ +.B quantum +BYTES ] [ +.B initial_quantum +BYTES ] [ +.B maxrate +RATE ] [ +.B buckets +NUMBER ] [ +.B orphan_mask +NUMBER ] [ +.B pacing +| +.B nopacing +] [ +.B ce_threshold +TIME ] + +.SH DESCRIPTION +FQ (Fair Queue) is a classless packet scheduler meant to be mostly +used for locally generated traffic. It is designed to achieve per flow pacing. +FQ does flow separation, and is able to respect pacing requirements set by the TCP stack. +All packets belonging to a socket are considered as a 'flow'. +For non local packets (router workload), packet hash is used as fallback. + +An application can specify a maximum pacing rate using the +.B SO_MAX_PACING_RATE +setsockopt call. This packet scheduler adds delay between packets to +respect rate limitation set on each socket. 
Note that after linux-4.20, linux adopted EDT (Earliest Departure Time) +and TCP directly sets the appropriate Departure Time for each skb. + +Dequeueing happens in a round-robin fashion. +A special FIFO queue is reserved for high priority packets ( +.B TC_PRIO_CONTROL +priority), such packets are always dequeued first. + +FQ is non-work-conserving. + +TCP pacing is good for flows having idle times, as the congestion +window permits TCP stack to queue a possibly large number of packets. +This removes the 'slow start after idle' choice, badly hitting +large BDP flows and applications delivering chunks of data such as video streams. + +.SH PARAMETERS +.SS limit +Hard limit on the real queue size. When this limit is reached, new packets +are dropped. If the value is lowered, packets are dropped so that the new limit is +met. Default is 10000 packets. +.SS flow_limit +Hard limit on the maximum number of packets queued per flow. +Default value is 100. +.SS quantum +The credit per dequeue RR round, i.e. the amount of bytes a flow is allowed to +dequeue at once. A larger value means a longer time period before the next flow +will be served. +Default is 2 * interface MTU bytes. +.SS initial_quantum +The initial sending rate credit, i.e. the amount of bytes a new flow is allowed +to dequeue initially. +This is specifically meant to allow using IW10 without added delay. +Default is 10 * interface MTU, i.e. 15140 for 'standard' ethernet. +.SS maxrate +Maximum sending rate of a flow. Default is unlimited. +Application specific setting via +.B SO_MAX_PACING_RATE +is ignored only if it is larger than this value. +.SS buckets +The size of the hash table used for flow lookups. Each bucket is assigned a +red-black tree for efficient collision sorting. +Default: 1024. +.SS orphan_mask +For packets not owned by a socket, fq is able to mask a part of skb->hash +and reduce number of buckets associated with the traffic. 
This is a DDoS +prevention mechanism, and the default is 1023 (meaning no more than 1024 flows +are allocated for these packets). +.SS [no]pacing +Enable or disable flow pacing. Default is enabled. +.SS ce_threshold +sets a threshold above which all packets are marked with ECN Congestion +Experienced. This is useful for DCTCP-style congestion control algorithms that +require marking at very shallow queueing thresholds. + +.SH EXAMPLES +#tc qdisc add dev eth0 root fq ce_threshold 4ms +.br +#tc -s -d qdisc show dev eth0 +.br +qdisc fq 8001: dev eth0 root refcnt 2 limit 10000p flow_limit 100p buckets 1024 orphan_mask 1023 quantum 3028b initial_quantum 15140b low_rate_threshold 550Kbit refill_delay 40.0ms ce_threshold 4.0ms + Sent 72149092 bytes 48062 pkt (dropped 2176, overlimits 0 requeues 0) + backlog 1937920b 1280p requeues 0 + flows 34 (inactive 17 throttled 0) + gc 0 highprio 0 throttled 0 ce_mark 47622 flows_plimit 2176 +.br +.SH SEE ALSO +.BR tc (8), +.BR socket (7) +.SH AUTHORS +FQ was written by Eric Dumazet. diff --git a/man/man8/tc-fq_codel.8 b/man/man8/tc-fq_codel.8 new file mode 100644 index 0000000..7859063 --- /dev/null +++ b/man/man8/tc-fq_codel.8 @@ -0,0 +1,143 @@ +.TH FQ_CoDel 8 "4 June 2012" "iproute2" "Linux" +.SH NAME +CoDel \- Fair Queuing (FQ) with Controlled Delay (CoDel) +.SH SYNOPSIS +.B tc qdisc ... fq_codel +[ +.B limit +PACKETS ] [ +.B flows +NUMBER ] [ +.B target +TIME ] [ +.B interval +TIME ] [ +.B quantum +BYTES ] [ +.B ecn +| +.B noecn +] [ +.B ce_threshold +TIME ] [ +.B ce_threshold_selector +VALUE/MASK ] [ +.B memory_limit +BYTES ] + +.SH DESCRIPTION +FQ_Codel (Fair Queuing Controlled Delay) is a queuing discipline that combines Fair +Queuing with the CoDel AQM scheme. FQ_Codel uses a stochastic model to classify +incoming packets into different flows and is used to provide a fair share of the +bandwidth to all the flows using the queue. Each such flow is managed by the +CoDel queuing discipline. 
Reordering within a flow is avoided since Codel +internally uses a FIFO queue. + +.SH PARAMETERS +.SS limit +has the same semantics as +.B codel +and is the hard limit on the real queue size. +When this limit is reached, incoming packets are dropped. Default is 10240 +packets. + +.SS memory_limit +sets a limit on the total number of bytes that can be queued in this FQ-CoDel +instance. The lower of the packet limit of the +.B limit +parameter and the memory limit will be enforced. Default is 32 MB. + + +.SS flows +is the number of flows into which the incoming packets are classified. Due to +the stochastic nature of hashing, multiple flows may end up being hashed into +the same slot. Newer flows have priority over older ones. This parameter can be +set only at load time since memory has to be allocated for the hash table. +Default value is 1024. + +.SS target +has the same semantics as +.B codel +and is the acceptable minimum +standing/persistent queue delay. This minimum delay is identified by tracking +the local minimum queue delay that packets experience. Default value is 5ms. + +.SS interval +has the same semantics as +.B codel +and is used to ensure that the measured minimum delay does not become too stale. +The minimum delay must be experienced in the last epoch of length +.BR interval . +It should be set on the order of the worst-case RTT through the bottleneck to +give endpoints sufficient time to react. Default value is 100ms. + +.SS quantum +is the number of bytes used as 'deficit' in the fair queuing algorithm. Default +is set to 1514 bytes which corresponds to the Ethernet MTU plus the hardware +header length of 14 bytes. + +.SS ecn | noecn +has the same semantics as +.B codel +and can be used to mark packets instead of dropping them. If +.B ecn +has been enabled, +.B noecn +can be used to turn it off and vice versa. Unlike +.B codel, ecn +is turned on by default. 
+ +.SS ce_threshold +sets a threshold above which all packets are marked with ECN Congestion +Experienced. This is useful for DCTCP-style congestion control algorithms that +require marking at very shallow queueing thresholds. + +.SS ce_threshold_selector +sets a filter so that the +.B ce_threshold +feature is applied to only a subset of the traffic seen by the qdisc. If set, the MASK value +will be applied as a bitwise AND to the diffserv/ECN byte of the IP header, and only if the +result of this masking equals VALUE, will the +.B ce_threshold +logic be applied to the packet. + +.SS drop_batch +sets the maximum number of packets to drop when +.B limit +or +.B memory_limit +is exceeded. Default value is 64. + +.SH EXAMPLES +#tc qdisc add dev eth0 root fq_codel +.br +#tc -s qdisc show +.br +qdisc fq_codel 8002: dev eth0 root refcnt 2 limit 10240p flows 1024 quantum 1514 + target 5.0ms interval 100.0ms ecn + Sent 428514 bytes 2269 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 + maxpacket 256 drop_overlimit 0 new_flow_count 0 ecn_mark 0 + new_flows_len 0 old_flows_len 0 + +#tc qdisc add dev eth0 root fq_codel limit 2000 target 3ms interval 40ms noecn +.br +#tc -s qdisc show +.br +qdisc fq_codel 8003: dev eth0 root refcnt 2 limit 2000p flows 1024 quantum 1514 +target 3.0ms interval 40.0ms + Sent 2588985006 bytes 1783629 pkt (dropped 0, overlimits 0 requeues 34869) + backlog 0b 0p requeues 34869 + maxpacket 65226 drop_overlimit 0 new_flow_count 73 ecn_mark 0 + new_flows_len 1 old_flows_len 3 + + +.SH SEE ALSO +.BR tc (8), +.BR tc-codel (8), +.BR tc-red (8) + +.SH AUTHORS +FQ_CoDel was implemented by Eric Dumazet. This manpage was written +by Vijay Subramanian. Please report corrections to the Linux Networking +mailing list <netdev@vger.kernel.org>. 
diff --git a/man/man8/tc-fq_pie.8 b/man/man8/tc-fq_pie.8 new file mode 100644 index 0000000..457a56b --- /dev/null +++ b/man/man8/tc-fq_pie.8 @@ -0,0 +1,166 @@ +.TH FQ-PIE 8 "23 January 2020" "iproute2" "Linux" + +.SH NAME + +FQ-PIE \- Flow Queue Proportional Integral controller Enhanced + +.SH SYNOPSIS + +.B tc qdisc ... fq_pie +[ \fBlimit\fR PACKETS ] [ \fBflows\fR NUMBER ] +.br + \ +[ \fBtarget\fR TIME ] [ \fBtupdate\fR TIME ] +.br + \ +[ \fBalpha\fR NUMBER ] [ \fBbeta\fR NUMBER ] +.br + \ +[ \fBquantum\fR BYTES ] [ \fBmemory_limit\fR BYTES ] +.br + \ +[ \fBecn_prob\fR PERCENTAGE ] [ [\fBno\fR]\fBecn\fR ] +.br + \ +[ [\fBno\fR]\fBbytemode\fR ] [ [\fBno_\fR]\fBdq_rate_estimator\fR ] + +.SH DESCRIPTION +FQ-PIE (Flow Queuing with Proportional Integral controller Enhanced) is a +queuing discipline that combines Flow Queuing with the PIE AQM scheme. FQ-PIE +uses a Jenkins hash function to classify incoming packets into different flows +and is used to provide a fair share of the bandwidth to all the flows using the +qdisc. Each such flow is managed by the PIE algorithm. + +.SH ALGORITHM +The FQ-PIE algorithm consists of two logical parts: the scheduler which selects +which queue to dequeue a packet from, and the PIE AQM which works on each of the +queues. The major work of FQ-PIE is mostly in the scheduling part. The +interaction between the scheduler and the PIE algorithm is straightforward. + +During the enqueue stage, a hashing-based scheme is used, where flows are hashed +into a number of buckets with each bucket having its own queue. The number of +buckets is configurable, and presently defaults to 1024 in the implementation. +The flow hashing is performed on the 5-tuple of source and destination IP +addresses, port numbers and IP protocol number. Once the packet has been +successfully classified into a queue, it is handed over to the PIE algorithm +for enqueuing. 
It is then added to the tail of the selected queue, and the +queue's byte count is updated by the packet size. If the queue is not currently +active (i.e., if it is not in either the list of new or the list of old queues) +, it is added to the end of the list of new queues, and its number of credits +is initiated to the configured quantum. Otherwise, the queue is left in its +current queue list. + +During the dequeue stage, the scheduler first looks at the list of new queues; +for the queue at the head of that list, if that queue has a negative number of +credits (i.e., it has already dequeued at least a quantum of bytes), it is given +an additional quantum of credits, the queue is put onto the end of the list of +old queues, and the routine selects the next queue and starts again. Otherwise, +that queue is selected for dequeue again. If the list of new queues is empty, +the scheduler proceeds down the list of old queues in the same fashion +(checking the credits, and either selecting the queue for dequeuing, or adding +credits and putting the queue back at the end of the list). After having +selected a queue from which to dequeue a packet, the PIE algorithm is invoked +on that queue. + +Finally, if the PIE algorithm does not return a packet, then the queue must be +empty and the scheduler does one of two things: + +If the queue selected for dequeue came from the list of new queues, it is moved +to the end of the list of old queues. If instead it came from the list of old +queues, that queue is removed from the list, to be added back (as a new queue) +the next time a packet arrives that hashes to that queue. Then (since no packet +was available for dequeue), the whole dequeue process is restarted from the +beginning. + +If, instead, the scheduler did get a packet back from the PIE algorithm, it +subtracts the size of the packet from the byte credits for the selected queue +and returns the packet as the result of the dequeue operation. 
+ +.SH PARAMETERS +.SS limit +It is the limit on the queue size in packets. Incoming packets are dropped when +the limit is reached. The default value is 10240 packets. + +.SS flows +It is the number of flows into which the incoming packets are classified. Due +to the stochastic nature of hashing, multiple flows may end up being hashed +into the same slot. Newer flows have priority over older ones. This +parameter can be set only at load time since memory has to be allocated for +the hash table. The default value is 1024. + +.SS target +It is the queue delay which the PIE algorithm tries to maintain. The default +target delay is 15ms. + +.SS tupdate +It is the time interval at which the system drop probability is calculated. +The default is 15ms. + +.SS alpha +.SS beta +alpha and beta are parameters chosen to control the drop probability. These +should be in the range between 0 and 32. + +.SS quantum +quantum signifies the number of bytes that may be dequeued from a queue before +switching to the next queue in the deficit round robin scheme. + +.SS memory_limit +It is the maximum total memory allowed for packets of all flows. The default is +32Mb. + +.SS ecn_prob +It is the drop probability threshold below which packets will be ECN marked +instead of getting dropped. The default is 10%. Setting this parameter requires +\fBecn\fR to be enabled. + +.SS \fR[\fBno\fR]\fBecn\fR +It has the same semantics as \fBpie\fR and can be used to mark packets +instead of dropping them. If \fBecn\fR has been enabled, \fBnoecn\fR can +be used to turn it off and vice versa. + +.SS \fR[\fBno\fR]\fBbytemode\fR +It is used to scale drop probability proportional to packet size: +\fBbytemode\fR to turn on bytemode, \fBnobytemode\fR to turn off +bytemode. By default, \fBbytemode\fR is turned off. 
+ +.SS \fR[\fBno_\fR]\fBdq_rate_estimator\fR +\fBdq_rate_estimator\fR can be used to calculate queue delay using Little's +Law, \fBno_dq_rate_estimator\fR can be used to calculate queue delay +using timestamp. By default, \fBdq_rate_estimator\fR is turned off. + +.SH EXAMPLES +# tc qdisc add dev eth0 root fq_pie +.br +# tc -s qdisc show dev eth0 +.br +qdisc fq_pie 8001: root refcnt 2 limit 10240p flows 1024 target 15.0ms tupdate +16.0ms alpha 2 beta 20 quantum 1514b memory_limit 32Mb ecn_prob 10 + Sent 159173586 bytes 105261 pkt (dropped 24, overlimits 0 requeues 0) + backlog 75700b 50p requeues 0 + pkts_in 105311 overlimit 0 overmemory 0 dropped 24 ecn_mark 0 + new_flow_count 7332 new_flows_len 0 old_flows_len 4 memory_used 108800 + +# tc qdisc add dev eth0 root fq_pie dq_rate_estimator +.br +# tc -s qdisc show dev eth0 +.br +qdisc fq_pie 8001: root refcnt 2 limit 10240p flows 1024 target 15.0ms tupdate +16.0ms alpha 2 beta 20 quantum 1514b memory_limit 32Mb ecn_prob 10 +dq_rate_estimator + Sent 8263620 bytes 5550 pkt (dropped 4, overlimits 0 requeues 0) + backlog 805448b 532p requeues 0 + pkts_in 6082 overlimit 0 overmemory 0 dropped 4 ecn_mark 0 + new_flow_count 94 new_flows_len 0 old_flows_len 8 memory_used 1157632 + +.SH SEE ALSO +.BR tc (8), +.BR tc-pie (8), +.BR tc-fq_codel (8) + +.SH SOURCES +RFC 8033: https://tools.ietf.org/html/rfc8033 + +.SH AUTHORS +FQ-PIE was implemented by Mohit P. Tahiliani. Please report corrections to the +Linux Networking mailing list <netdev@vger.kernel.org>. diff --git a/man/man8/tc-fw.8 b/man/man8/tc-fw.8 new file mode 100644 index 0000000..589505a --- /dev/null +++ b/man/man8/tc-fw.8 @@ -0,0 +1,104 @@ +.TH "Firewall mark classifier in tc" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +fw \- fwmark traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... 
" fw " [ " classid +.IR CLASSID " ] [ " +.B action +.IR ACTION_SPEC " ]" +.SH DESCRIPTION +the +.B fw +filter allows one to classify packets based on a previously set +.BR fwmark " by " iptables . +If the masked value of the +.B fwmark +matches the filter's masked +.BR handle , +the filter matches. By default, all 32 bits of the +.B handle +and the +.B fwmark +are masked. +.B iptables +allows one to mark single packets with the +.B MARK +target, or whole connections using +.BR CONNMARK . +The benefit of using this filter instead of doing the +heavy-lifting with +.B tc +itself is that on one hand it might be convenient to keep packet filtering and +classification in one place, possibly having to match a packet just once, and on +the other users familiar with +.BR iptables " but not " tc +will have a less hard time adding QoS to their setups. +.SH OPTIONS +.TP +.BI classid " CLASSID" +Push matching packets to the class identified by +.IR CLASSID . +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.SH EXAMPLES +Take e.g. the following tc filter statement: + +.RS +.EX +tc filter add ... handle 6 fw classid 1:1 +.EE +.RE + +will match if the packet's +.B fwmark +value is +.BR 6 . +This is a sample +.B iptables +statement marking packets coming in on eth0: + +.RS +.EX +iptables -t mangle -A PREROUTING -i eth0 -j MARK --set-mark 6 +.EE +.RE + +Specific bits of the packet's +.B fwmark +can be set using the +.B skbedit +action. For example, to only set one bit of the +.B fwmark +without changing any other bit: + +.RS +.EX +tc filter add ... action skbedit mark 0x8/0x8 +.EE +.RE + +The +.B fw +filter can then be used to match on this bit by masking the +.B handle: + +.RS +.EX +tc filter add ... handle 0x8/0x8 fw action drop +.EE +.RE + +This is useful when different bits of the +.B fwmark +are assigned different meanings. 
+.EE +.RE +.SH SEE ALSO +.BR tc (8), +.BR iptables (8), +.BR iptables-extensions (8), +.BR tc-skbedit (8) diff --git a/man/man8/tc-gact.8 b/man/man8/tc-gact.8 new file mode 100644 index 0000000..81aa30e --- /dev/null +++ b/man/man8/tc-gact.8 @@ -0,0 +1,85 @@ +.TH "Generic actions in tc" 8 "11 Jan 2023" "iproute2" "Linux" + +.SH NAME +gact \- generic action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action gact" +.IR CONTROL " [ " RAND " ] [ " INDEX " ]" +.ti -8 +.IR CONTROL " := { " +.BR reclassify " | " drop " | " continue " | " pass " | " pipe " | " +.br +.BI "goto chain " "CHAIN_INDEX" +| +.br +.BI "jump " "JUMP_COUNT" +} + +.ti -8 +.IR RAND " := " +.BI random " RANDTYPE CONTROL VAL" +.ti -8 +.IR RANDTYPE " := { " +.BR netrand " | " determ " }" +.ti -8 +.IR VAL " := number not exceeding 10000" +.ti -8 +.IR JUMP_COUNT " := absolute jump from start of action list" +.ti -8 +.IR INDEX " := index value used" + +.SH DESCRIPTION +The +.B gact +action allows reclassifying, dropping, passing, or accepting packets. +At the moment there are only two algorithms. One is deterministic +and the other uses internal kernel netrand. + +.SH OPTIONS +.TP +.BI random " RANDTYPE CONTROL VAL" +The probability of taking the action expressed in terms of 1 out of +.I VAL +packets. + +.TP +.I CONTROL +Indicate how +.B tc +should proceed if the packet matches. +For a description of the possible +.I CONTROL +values, see +.BR tc-actions (8). + +.SH EXAMPLES +Apply a rule on ingress to drop packets from a given source address. 
+.RS +.EX +# tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \ +10.0.0.9/32 flowid 1:16 action drop +.EE +.RE + +Allow 1 out of 10 packets from source randomly using the netrand generator +.RS +.EX +# tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \ +10.0.0.9/32 flowid 1:16 action drop random netrand ok 10 +.EE +.RE + +Deterministically accept every second packet +.RS +.EX +# tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \ +10.0.0.9/32 flowid 1:16 action drop random determ ok 2 +.EE +.RE + +.SH SEE ALSO +.BR tc (8), +.BR tc-actions (8), +.BR tc-u32 (8) diff --git a/man/man8/tc-gate.8 b/man/man8/tc-gate.8 new file mode 100644 index 0000000..23d93ca --- /dev/null +++ b/man/man8/tc-gate.8 @@ -0,0 +1,123 @@ +.TH GATE 8 "12 Mar 2020" "iproute2" "Linux" +.SH NAME +gate \- Stream Gate Action +.SH SYNOPSIS +.B tc " ... " action gate +.ti +8 +.B [ base-time +BASETIME ] +.B [ clockid +CLOCKID ] +.ti +8 +.B sched-entry +<gate state> <interval 1> [ <internal priority> <max octets> ] +.ti +8 +.B sched-entry +<gate state> <interval 2> [ <internal priority> <max octets> ] +.ti +8 +.B sched-entry +<gate state> <interval 3> [ <internal priority> <max octets> ] +.ti +8 +.B ...... +.ti +8 +.B sched-entry +<gate state> <interval N> [ <internal priority> <max octets> ] + +.SH DESCRIPTION +The GATE action allows specified ingress frames to be passed during +specific time slots, or to be dropped during specific time slots. A tc filter +selects the ingress frames, then the tc gate action specifies in which time +slot and how many bytes of these frames can be passed to the device and +in which time slot frames are dropped. +The gate action also assigns a base-time to tell when the entry list starts. +The gate action then repeats the gate entry list cyclically, +starting at the base-time. +For the software simulation, the gate action requires the user to assign a reference +time clock type. 
+ +.SH PARAMETERS + +.TP +base-time +.br +Specifies the instant in nanoseconds, defining the time when the schedule +starts. If 'base-time' is a time in the past, the schedule will start at + +base-time + (N * cycle-time) + +where N is the smallest integer so the resulting time is greater than +"now", and "cycle-time" is the sum of all the intervals of the entries +in the schedule. If base-time is not specified, it defaults to 0. + +.TP +clockid +.br +Specifies the clock to be used by qdisc's internal timer for measuring +time and scheduling events. Not valid if the gate action is used with an offloaded +filter, +for example a tc filter command with the +.B skip_sw +parameter. + +.TP +sched-entry +.br +There may be multiple +.B sched-entry +parameters in a single schedule. Each one has the format: + +sched-entry <gate state> <interval> [ <internal priority> <max octets> ] + +.br +<gate state> is the gate state: 'open' keeps the gate open, 'close' keeps the gate closed. +.br +<interval> is the length of this time slot in nanoseconds. +.br +<internal priority> is the internal priority value. It represents the +internal receiving queue for this stream. "-1" means wildcard. +<internal priority> and <max octets> can be omitted, in which case both +default to "-1" for this <sched-entry>. +.br +<max octets> is the maximum number of octets that can pass in this time slot. Frames are dropped +if the limit is exceeded. "-1" means wildcard. If <max octets> is omitted, it defaults to +"-1" for this <sched-entry>. +.br +Note that <internal priority> and <max octets> are meaningless when the gate state +is "close" in a "sched-entry". All frames are dropped while a "sched-entry" is in the +"close" state. + +.SH EXAMPLES + +In the following example, frames matched by the tc filter on source IP +192.168.0.20 pass while the gate is open for 200ms, limited to 8MB of traffic +in this sched-entry. The gate is then kept closed for 100ms. +Frames that arrive while the gate is closed are dropped. 
Then the cycle +repeats the gate entries periodically. The schedule will start at instant 200.0s +using the reference clock CLOCK_TAI. The schedule is composed of two entries +(200ms and 100ms), giving a cycle time of 300ms. + +.EX +# tc qdisc add dev eth0 ingress +# tc filter add dev eth0 parent ffff: protocol ip \\ + flower skip_hw src_ip 192.168.0.20 \\ + action gate index 2 clockid CLOCK_TAI \\ + base-time 200000000000ns \\ + sched-entry open 200000000ns -1 8000000b \\ + sched-entry close 100000000ns + +.EE + +The following commands are an example in which ICMP frames with destination MAC +10:00:80:00:00:00 are dropped at all times, with a cycle of 200ms. +The base-time defaults to 0 and the clockid defaults to CLOCK_TAI. + +.EX +# tc qdisc add dev eth0 ingress +# tc filter add dev eth0 parent ffff: protocol ip \\ + flower ip_proto icmp dst_mac 10:00:80:00:00:00 \\ + action gate index 12 sched-entry close 200000000ns + +.EE + +.SH AUTHORS +Po Liu <Po.Liu@nxp.com> diff --git a/man/man8/tc-hfsc.8 b/man/man8/tc-hfsc.8 new file mode 100644 index 0000000..fd0df8f --- /dev/null +++ b/man/man8/tc-hfsc.8 @@ -0,0 +1,61 @@ +.TH HFSC 8 "31 October 2011" iproute2 Linux +. +.SH NAME +HFSC \- Hierarchical Fair Service Curve's control under Linux +. +.SH SYNOPSIS +.nf +tc qdisc add ... hfsc [ \fBdefault\fR CLASSID ] + +tc class add ... hfsc [ [ \fBrt\fR SC ] [ \fBls\fR SC ] | [ \fBsc\fR SC ] ] [ \fBul\fR SC ] + +\fBrt\fR : realtime service curve +\fBls\fR : linkshare service curve +\fBsc\fR : rt+ls service curve +\fBul\fR : upperlimit service curve + +\(bu at least one of \fBrt\fR, \fBls\fR or \fBsc\fR must be specified +\(bu \fBul\fR can only be specified with \fBls\fR or \fBsc\fR +. 
+.IP "SC := [ [ \fBm1\fR BPS ] \fBd\fR SEC ] \fBm2\fR BPS" +\fBm1\fR : slope of the first segment +\fBd\fR : x\-coordinate of intersection +\fBm2\fR : slope of the second segment +.PP +.IP "SC := [ [ \fBumax\fR BYTE ] \fBdmax\fR SEC ] \fBrate\fR BPS" +\fBumax\fR : maximum unit of work +\fBdmax\fR : maximum delay +\fBrate\fR : rate +.PP +.fi +For description of BYTE, BPS and SEC \- please see \fBUNITS\fR +section of \fBtc\fR(8). +. +.SH DESCRIPTION (qdisc) +HFSC qdisc has only one optional parameter \- \fBdefault\fR. CLASSID specifies +the minor part of the default classid, where packets not classified by other +means (e.g. u32 filter, CLASSIFY target of iptables) will be enqueued. If +\fBdefault\fR is not specified, unclassified packets will be dropped. +. +.SH DESCRIPTION (class) +HFSC class is used to create a class hierarchy for HFSC scheduler. For +explanation of the algorithm, and the meaning behind \fBrt\fR, \fBls\fR, +\fBsc\fR and \fBul\fR service curves \- please refer to \fBtc\-hfsc\fR(7). + +As you can see in \fBSYNOPSIS\fR, service curve (SC) can be specified in two +ways. Either as maximum delay for certain amount of work, or as a bandwidth +assigned for certain amount of time. Obviously, \fBm1\fR is simply +\fBumax\fR/\fBdmax\fR. + +Both \fBm2\fR and \fBrate\fR are mandatory. If you omit other +parameters, you will specify linear service curve. +. +.SH "SEE ALSO" +. +\fBtc\fR(8), \fBtc\-hfsc\fR(7), \fBtc\-stab\fR(8) + +Please direct bugreports and patches to: <netdev@vger.kernel.org> +. +.SH "AUTHOR" +. +Manpage created by Michal Soltys (soltys@ziu.info) diff --git a/man/man8/tc-htb.8 b/man/man8/tc-htb.8 new file mode 100644 index 0000000..59b159f --- /dev/null +++ b/man/man8/tc-htb.8 @@ -0,0 +1,171 @@ +.TH HTB 8 "10 January 2002" "iproute2" "Linux" +.SH NAME +HTB \- Hierarchy Token Bucket +.SH SYNOPSIS +.B tc qdisc ... 
dev +dev +.B ( parent +classid +.B | root) [ handle +major: +.B ] htb [ default +minor-id +.B ] [ r2q +divisor +.B ] [ offload ] + +.B tc class ... dev +dev +.B parent +major:[minor] +.B [ classid +major:minor +.B ] htb rate +rate +.B [ ceil +rate +.B ] burst +bytes +.B [ cburst +bytes +.B ] [ prio +priority +.B ] [ quantum +bytes +.B ] + +.SH DESCRIPTION +HTB allows control of the outbound bandwidth on a given link. +It allows simulating several slower links and to send different +kinds of traffic on different simulated links. In both cases, you have +to specify how to divide the physical link into simulated links and +how to decide which simulated link to use for a given packet to be sent. + +HTB shapes traffic based on the Token Bucket Filter algorithm +which does not depend on interface characteristics and so does not need to +know the underlying bandwidth of the outgoing interface. + +.SH SHAPING ALGORITHM +Shaping works as documented in +.B tc-tbf (8). + +.SH CLASSIFICATION +Within the one HTB instance many classes may exist. Each of these classes +contains another qdisc, by default +.BR tc-pfifo (8). + +When enqueueing a packet, HTB starts at the root and uses various methods to +determine which class should receive the data. + +In the absence of uncommon configuration options, the process is rather easy. +At each node we look for an instruction, and then go to the class the +instruction refers us to. If the class found is a barren leaf-node (without +children), we enqueue the packet there. If it is not yet a leaf node, we do +the whole thing over again starting from that node. + +The following actions are performed, in order at each node we visit, until one +sends us to another node, or terminates the process. +.TP +(i) +Consult filters attached to the class. If sent to a leafnode, we are done. +Otherwise, restart. +.TP +(ii) +If none of the above returned with an instruction, enqueue at this node. 
+.P +This algorithm makes sure that a packet always ends up somewhere, even while +you are busy building your configuration. + +.SH LINK SHARING ALGORITHM +FIXME + +.SH QDISC +The root of a HTB qdisc class tree has the following parameters: + +.TP +parent major:minor | root +This mandatory parameter determines the place of the HTB instance, either at the +.B root +of an interface or within an existing class. +.TP +handle major: +Like all other qdiscs, the HTB can be assigned a handle. Should consist only +of a major number, followed by a colon. Optional, but very useful if classes +will be generated within this qdisc. +.TP +default minor-id +Unclassified traffic gets sent to the class with this minor-id. +.TP +r2q divisor +Divisor used to calculate +.B quantum +values for classes. Classes divide +.B rate +by this number. Default value is 10. +.TP +offload +Offload the HTB algorithm to hardware (requires driver and device support). + +.SH CLASSES +Classes have a host of parameters to configure their operation. + +.TP +parent major:minor +Place of this class within the hierarchy. If attached directly to a qdisc +and not to another class, minor can be omitted. Mandatory. +.TP +classid major:minor +Like qdiscs, classes can be named. The major number must be equal to the +major number of the qdisc to which it belongs. Optional, but needed if this +class is going to have children. +.TP +prio priority +In the round-robin process, classes with the lowest priority field are tried +for packets first. + +.TP +rate rate +Maximum rate this class and all its children are guaranteed. Mandatory. + +.TP +ceil rate +Maximum rate at which a class can send, if its parent has bandwidth to spare. +Defaults to the configured rate, which implies no borrowing + +.TP +burst bytes +Amount of bytes that can be burst at +.B ceil +speed, in excess of the configured +.B rate. +Should be at least as high as the highest burst of all children. 
+ +.TP +cburst bytes +Amount of bytes that can be burst at 'infinite' speed, in other words, as fast +as the interface can transmit them. For perfect evening out, should be equal to at most one average +packet. Should be at least as high as the highest cburst of all children. + +.TP +quantum bytes +Number of bytes to serve from this class before the scheduler moves to the next class. +Default value is +.B rate +divided by the qdisc +.B r2q +parameter. If specified, +.B r2q +is ignored. + +.SH NOTES +Due to Unix timing constraints, the maximum ceil rate is not infinite and may in fact be quite low. On Intel, +there are 100 timer events per second, the maximum rate is that rate at which 'burst' bytes are sent each timer tick. +From this, the minimum burst size for a specified rate can be calculated. For i386, a 10mbit rate requires a 12 kilobyte +burst as 100*12kb*8 equals 10mbit. + +.SH SEE ALSO +.BR tc (8) +.P +HTB website: http://luxik.cdi.cz/~devik/qos/htb/ +.SH AUTHOR +Martin Devera <devik@cdi.cz>. This manpage maintained by bert hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-ife.8 b/man/man8/tc-ife.8 new file mode 100644 index 0000000..fd2df6c --- /dev/null +++ b/man/man8/tc-ife.8 @@ -0,0 +1,143 @@ +.TH "IFE action in tc" 8 "22 Apr 2016" "iproute2" "Linux" + +.SH NAME +IFE - encapsulate/decapsulate metadata +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... 
" " action ife" +.IR DIRECTION " [ " ACTION " ] " +.RB "[ " dst +.IR DMAC " ] " +.RB "[ " src +.IR SMAC " ] " +.RB "[ " type +.IR TYPE " ] " +.RI "[ " +.IR CONTROL " ] " +.RB "[ " index +.IR INDEX " ] " + +.ti -8 +.IR DIRECTION " := { " +.BR decode " | " encode " }" + +.ti -8 +.IR ACTION " := { " +.BI allow " ATTR" +.RB "| " use +.IR "ATTR value" " }" + +.ti -8 +.IR ATTR " := { " +.BR mark " | " prio " | " tcindex " }" + +.ti -8 +.IR CONTROL " := { " +.BR reclassify " | " use " | " pipe " | " drop " | " continue " | " ok " | " goto " " chain " " CHAIN_INDEX " }" +.SH DESCRIPTION +The +.B ife +action allows for a sending side to encapsulate arbitrary metadata, which is +then decapsulated by the receiving end. The sender runs in encode mode and +the receiver in decode mode. Both sender and receiver must specify the same +ethertype. In the future, a registered ethertype may be available as a default. +.SH OPTIONS +.TP +.B decode +For the receiving side; decode the metadata if the packet matches. +.TP +.B encode +For the sending side. Encode the specified metadata if the packet matches. +.TP +.B allow +Encode direction only. Allows encoding specified metadata. +.TP +.B use +Encode direction only. Enforce static encoding of specified metadata. +.TP +.BR mark " [ " +.IR u32_value " ]" +The value to set for the skb mark. The u32 value is required only when +.BR use " is specified. If the" +.BR mark " value is zero, it will not be encoded; instead the" +"overlimits" statistics are incremented and the +.BR CONTROL " action is taken." +.TP +.BR prio " [ " +.IR u32_value " ]" +The value to set for priority in the skb structure. The u32 value is required +only when +.BR use " is specified." +.TP +.BR tcindex " [" +.IR u16_value " ]" +Value to set for the traffic control index in the skb structure. The u16 value +is required only when +.BR use " is specified." +.TP +.BI dst " DMAC" +.TQ +.BI src " SMAC" +Optional six byte destination or source MAC address to encode. 
+.TP +.BI type " TYPE" +Optional 16-bit ethertype to encode. If not specified, a value of 0xED3E will be used. +.TP +.BI CONTROL +Action to take following an encode/decode. +.TP +.BI index " INDEX" +Assign a unique ID to this action instead of letting the kernel choose one +automatically. +.I INDEX +is a 32bit unsigned integer greater than zero. +.SH EXAMPLES + +On the receiving side, match packets with ethertype 0xdead and restart +classification so that it will match ICMP on the next rule, at prio 3: +.RS +.EX +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: prio 2 protocol 0xdead \\ + u32 match u32 0 0 flowid 1:1 \\ + action ife decode reclassify +# tc filter add dev eth0 parent ffff: prio 3 protocol ip \\ + u32 match ip protocol 1 0xff flowid 1:1 \\ + action continue +.EE +.RE + +Match with skb mark of 17: + +.RS +.EX +# tc filter add dev eth0 parent ffff: prio 4 protocol ip \\ + handle 0x11 fw flowid 1:1 \\ + action ok +.EE +.RE + +Configure the sending side to encode for the filters above. Use a destination +IP address of 192.168.122.237/24, then tag with skb mark of decimal 17. Encode +the packet with ethertype 0xdead, add skb->mark to the whitelist of metadata to +send, and rewrite the destination MAC address to 02:15:15:15:15:15. 
+ +.RS +.EX +# tc qdisc add dev eth0 root handle 1: prio +# tc filter add dev eth0 parent 1: protocol ip prio 10 u32 \\ + match ip dst 192.168.122.237/24 \\ + match ip protocol 1 0xff \\ + flowid 1:2 \\ + action skbedit mark 17 \\ + action ife encode \\ + type 0xDEAD \\ + allow mark \\ + dst 02:15:15:15:15:15 +.EE +.RE + +.SH SEE ALSO +.BR tc (8), +.BR tc-u32 (8) diff --git a/man/man8/tc-matchall.8 b/man/man8/tc-matchall.8 new file mode 100644 index 0000000..d022406 --- /dev/null +++ b/man/man8/tc-matchall.8 @@ -0,0 +1,87 @@ +.TH "Match-all classifier in tc" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +matchall \- traffic control filter that matches every packet +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " matchall " [ " +.BR skip_sw " | " skip_hw +.RI " ] [ " +.B action +.IR ACTION_SPEC " ] [ " +.B classid +.IR CLASSID " ]" +.SH DESCRIPTION +The +.B matchall +filter allows one to classify every packet that flows on the port and run an +action on it. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Push matching packets into the class identified by +.IR CLASSID . +.TP +.BI skip_sw +Do not process filter by software. If hardware has no offload support for this +filter, or TC offload is not enabled for the interface, operation will fail. +.TP +.BI skip_hw +Do not process filter by hardware. +.SH EXAMPLES +To create ingress mirroring from port eth1 to port eth2: +.RS +.EX + +tc qdisc add dev eth1 handle ffff: ingress +tc filter add dev eth1 parent ffff: \\ + matchall skip_sw \\ + action mirred egress mirror \\ + dev eth2 +.EE +.RE + +The first command creates an ingress qdisc with handle +.BR ffff: +on device +.BR eth1 , +and the second command attaches a matchall filter to it that mirrors the +packets to device eth2. 
+ +To create egress mirroring from port eth1 to port eth2: +.RS +.EX + +tc qdisc add dev eth1 handle 1: root prio +tc filter add dev eth1 parent 1: \\ + matchall skip_sw \\ + action mirred egress mirror \\ + dev eth2 +.EE +.RE + +The first command creates an egress qdisc with handle +.BR 1: +that replaces the root qdisc on device +.BR eth1 , +and the second command attaches a matchall filter to it that mirrors the +packets to device eth2. + +To sample one of every 100 packets flowing into interface eth0 to psample group +12: +.RS +.EX + +tc qdisc add dev eth0 handle ffff: ingress +tc filter add dev eth0 parent ffff: matchall \\ + action sample rate 100 group 12 +.EE +.RE + +.EE +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-mirred.8 b/man/man8/tc-mirred.8 new file mode 100644 index 0000000..e529fa6 --- /dev/null +++ b/man/man8/tc-mirred.8 @@ -0,0 +1,107 @@ +.TH "Mirror/redirect action in tc" 8 "11 Jan 2015" "iproute2" "Linux" + +.SH NAME +mirred - mirror/redirect action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action mirred" +.I DIRECTION ACTION +.RB "[ " index +.IR INDEX " ] " +.BI dev " DEVICENAME" + +.ti -8 +.IR DIRECTION " := { " +.BR ingress " | " egress " }" + +.ti -8 +.IR ACTION " := { " +.BR mirror " | " redirect " }" +.SH DESCRIPTION +The +.B mirred +action allows packet mirroring (copying) or redirecting (stealing) the packet it +receives. Mirroring is what is sometimes referred to as Switch Port Analyzer +(SPAN) and is commonly used to analyze and/or debug flows. +.SH OPTIONS +.TP +.B ingress +.TQ +.B egress +Specify the direction in which the packet shall appear on the destination +interface. +.TP +.B mirror +.TQ +.B redirect +Define whether the packet should be copied +.RB ( mirror ) +or moved +.RB ( redirect ) +to the destination interface. +.TP +.BI index " INDEX" +Assign a unique ID to this action instead of letting the kernel choose one +automatically. +.I INDEX +is a 32bit unsigned integer greater than zero. 
+.TP +.BI dev " DEVICENAME" +Specify the network interface to redirect or mirror to. +.SH EXAMPLES +Limit ingress bandwidth on eth0 to 1mbit/s, redirect exceeding traffic to lo for +debugging purposes: + +.RS +.EX +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \\ + match u32 0 0 \\ + action police rate 1mbit burst 100k conform-exceed pipe \\ + action mirred egress redirect dev lo +.EE +.RE + +Mirror all incoming ICMP packets on eth0 to a dummy interface for examination +with e.g. tcpdump: + +.RS +.EX +# ip link add dummy0 type dummy +# ip link set dummy0 up +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: protocol ip \\ + u32 match ip protocol 1 0xff \\ + action mirred egress mirror dev dummy0 +.EE +.RE + +Using an +.B ifb +interface, it is possible to send ingress traffic through an instance of +.BR sfq : + +.RS +.EX +# modprobe ifb +# ip link set ifb0 up +# tc qdisc add dev ifb0 root sfq +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \\ + match u32 0 0 \\ + action mirred egress redirect dev ifb0 +.EE +.RE + +.SH LIMITATIONS +The kernel restricts nesting to four levels to avoid the chance +of nesting loops. +.PP +Do not redirect from one IFB device to another. +IFB is a very specialized case of a packet redirecting device. +Redirecting from ifbX->ifbY will cause all packets to be dropped. + +.SH SEE ALSO +.BR tc (8), +.BR tc-u32 (8) diff --git a/man/man8/tc-mpls.8 b/man/man8/tc-mpls.8 new file mode 100644 index 0000000..7f8be22 --- /dev/null +++ b/man/man8/tc-mpls.8 @@ -0,0 +1,194 @@ +.TH "MPLS manipulation action in tc" 8 "22 May 2019" "iproute2" "Linux" + +.SH NAME +mpls - mpls manipulation module +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... 
" "action mpls" " { " +.IR POP " | " PUSH " | " MODIFY " | " +.BR dec_ttl " } [ " +.IR CONTROL " ]" + +.ti -8 +.IR POP " := " +.BR pop " " protocol +.IR MPLS_PROTO + +.ti -8 +.IR PUSH " := " +.RB "{ " push " | " mac_push " } [ " protocol +.IR MPLS_PROTO " ]" +.RB " [ " tc +.IR MPLS_TC " ] " +.RB " [ " ttl +.IR MPLS_TTL " ] " +.RB " [ " bos +.IR MPLS_BOS " ] " +.BI label " MPLS_LABEL" + +.ti -8 +.IR MODIFY " := " +.BR modify " [ " label +.IR MPLS_LABEL " ]" +.RB " [ " tc +.IR MPLS_TC " ] " +.RB " [ " ttl +.IR MPLS_TTL " ] " + +.ti -8 +.IR CONTROL " := { " +.BR reclassify " | " pipe " | " drop " | " continue " | " pass " | " goto " " chain " " CHAIN_INDEX " }" +.SH DESCRIPTION +The +.B mpls +action performs mpls encapsulation or decapsulation on a packet, reflected by the +operation modes +.IR POP ", " PUSH ", " MODIFY " and " DEC_TTL . +The +.I POP +mode requires the ethertype of the header that follows the MPLS header (e.g. +IPv4 or another MPLS). It will remove the outer MPLS header and replace the +ethertype in the MAC header with that passed. The +.IR PUSH " and " MODIFY +modes update the current MPLS header information or add a new header. +.IR PUSH +requires at least an +.IR MPLS_LABEL ". " +.I DEC_TTL +requires no arguments and simply subtracts 1 from the MPLS header TTL field. + +.SH OPTIONS +.TP +.B pop +Decapsulation mode. Requires the protocol of the next header. +.TP +.B push +Encapsulation mode. Adds the MPLS header between the MAC and the network +headers. Requires at least the +.B label +option. +.TP +.B mac_push +Encapsulation mode. Adds the MPLS header before the MAC header. Requires at +least the +.B label +option. +.TP +.B modify +Replace mode. Existing MPLS tag is replaced. +.BR label ", " +.BR tc ", " +and +.B ttl +are all optional. +.TP +.B dec_ttl +Decrement the TTL field on the outer most MPLS header. +.TP +.BI label " MPLS_LABEL" +Specify the MPLS LABEL for the outer MPLS header. 
+.I MPLS_LABEL +is an unsigned 20bit integer, the format is detected automatically (e.g. prefix +with +.RB ' 0x ' +for hexadecimal interpretation, etc.). +.TP +.BI protocol " MPLS_PROTO" +Choose the protocol to use. For push actions this must be +.BR mpls_uc " or " mpls_mc " (" mpls_uc +is the default). For pop actions it should be the protocol of the next header. +This option cannot be used with modify. +.TP +.BI tc " MPLS_TC" +Choose the TC value for the outer MPLS header. Decimal number in range of 0-7. +Defaults to 0. +.TP +.BI ttl " MPLS_TTL" +Choose the TTL value for the outer MPLS header. Number in range of 0-255. A +non-zero default value will be selected if this is not explicitly set. +.TP +.BI bos " MPLS_BOS" +Manually configure the bottom of stack bit for an MPLS header push. The default +is for TC to automatically set (or unset) the bit based on the next header of +the packet. +.TP +.I CONTROL +How to continue after executing this action. +.RS +.TP +.B reclassify +Restarts classification by jumping back to the first filter attached to this +action's parent. +.TP +.B pipe +Continue with the next action, this is the default. +.TP +.B drop +Packet will be dropped without running further actions. +.TP +.B continue +Continue classification with next filter in line. +.TP +.B pass +Return to calling qdisc for packet processing. This ends the classification +process. 
+.RE +.SH EXAMPLES +The following example encapsulates incoming IP packets on eth0 into MPLS with +a label 123 and sends them out eth1: + +.RS +.EX +#tc qdisc add dev eth0 handle ffff: ingress +#tc filter add dev eth0 protocol ip parent ffff: flower \\ + action mpls push protocol mpls_uc label 123 \\ + action mirred egress redirect dev eth1 +.EE +.RE + +In this example, incoming MPLS unicast packets on eth0 are decapsulated +and redirected to eth1: + +.RS +.EX +#tc qdisc add dev eth0 handle ffff: ingress +#tc filter add dev eth0 protocol mpls_uc parent ffff: flower \\ + action mpls pop protocol ipv4 \\ + action mirred egress redirect dev eth1 +.EE +.RE + +Here is another example, where incoming Ethernet frames are encapsulated into +MPLS with label 123 and TTL 64. Then, an outer Ethernet header is added and the +resulting frame is finally sent on eth1: + +.RS +.EX +#tc qdisc add dev eth0 ingress +#tc filter add dev eth0 ingress matchall \\ + action mpls mac_push label 123 ttl 64 \\ + action vlan push_eth \\ + dst_mac 02:00:00:00:00:02 \\ + src_mac 02:00:00:00:00:01 \\ + action mirred egress redirect dev eth1 +.EE +.RE + +The following example assumes that incoming MPLS packets with label 123 +transport Ethernet frames. The outer Ethernet and the MPLS headers are +stripped, then the inner Ethernet frame is sent on eth1: + +.RS +.EX +#tc qdisc add dev eth0 ingress +#tc filter add dev eth0 ingress protocol mpls_uc \\ + flower mpls_label 123 mpls_bos 1 \\ + action vlan pop_eth \\ + action mpls pop protocol teb \\ + action mirred egress redirect dev eth1 +.EE +.RE + +.SH SEE ALSO +.BR tc "(8), " tc-mirred "(8), " tc-vlan (8) diff --git a/man/man8/tc-mqprio.8 b/man/man8/tc-mqprio.8 new file mode 100644 index 0000000..724ef90 --- /dev/null +++ b/man/man8/tc-mqprio.8 @@ -0,0 +1,290 @@ +.TH MQPRIO 8 "24 Sept 2013" "iproute2" "Linux" +.SH NAME +MQPRIO \- Multiqueue Priority Qdisc (Offloaded Hardware QOS) +.SH SYNOPSIS +.B tc qdisc ... 
dev +dev ( +.B parent +classid | root) [ +.B handle +major: ] +.B mqprio +.ti +8 +[ +.B num_tc +tcs ] [ +.B map +P0 P1 P2... ] [ +.B queues +count1@offset1 count2@offset2 ... ] +.ti +8 +[ +.B hw +1|0 ] [ +.B mode +dcb|channel ] [ +.B shaper +dcb|bw_rlimit ] +.ti +8 +[ +.B min_rate +min_rate1 min_rate2 ... ] [ +.B max_rate +max_rate1 max_rate2 ... ] +.ti +8 +[ +.B fp +FP0 FP1 FP2 ... ] + +.SH DESCRIPTION +The MQPRIO qdisc is a simple queuing discipline that allows mapping +traffic flows to hardware queue ranges using priorities and a configurable +priority to traffic class mapping. A traffic class in this context is +a set of contiguous qdisc classes which map 1:1 to a set of hardware +exposed queues. + +By default the qdisc allocates a pfifo qdisc (packet limited first in, first +out queue) per TX queue exposed by the lower layer device. Other queuing +disciplines may be added subsequently. Packets are enqueued using the +.B map +parameter and hashed across the indicated queues in the +.B offset +and +.B count. +By default these parameters are configured by the hardware +driver to match the hardware QOS structures. + +.B Channel +mode supports full offload of the mqprio options, the traffic classes, the queue +configurations and QOS attributes to the hardware. Enabled hardware can provide +hardware QOS with the ability to steer traffic flows to designated traffic +classes provided by this qdisc. Hardware based QOS is configured using the +.B shaper +parameter. +.B bw_rlimit +with minimum and maximum bandwidth rates can be used for setting +transmission rates on each traffic class. Also further qdiscs may be added +to the classes of MQPRIO to create more complex configurations. + +.SH ALGORITHM +On creation with 'tc qdisc add', eight traffic classes are created mapping +priorities 0..7 to traffic classes 0..7 and priorities greater than 7 to +traffic class 0. 
This requires base driver support and the creation will +fail on devices that do not support hardware QOS schemes. + +These defaults can be overridden using the qdisc parameters. Providing +the 'hw 0' flag allows software to run without hardware coordination. + +If hardware coordination is being used and arguments are provided that +the hardware can not support then an error is returned. For many users +hardware defaults should work reasonably well. + +As one specific example numerous Ethernet cards support the 802.1Q +link strict priority transmission selection algorithm (TSA). MQPRIO +enabled hardware in conjunction with the classification methods below +can provide hardware offloaded support for this TSA. + +.SH CLASSIFICATION +Multiple methods are available to set the SKB priority which MQPRIO +uses to select which traffic class to enqueue the packet. +.TP +From user space +A process with sufficient privileges can encode the destination class +directly with SO_PRIORITY, see +.BR socket(7). +.TP +with iptables/nftables +An iptables/nftables rule can be created to match traffic flows and +set the priority. +.BR iptables(8) +.TP +with net_prio cgroups +The net_prio cgroup can be used to set the priority of all sockets +belong to an application. See kernel and cgroup documentation for details. + +.SH QDISC PARAMETERS +.TP +num_tc +Number of traffic classes to use. Up to 16 classes supported. +You cannot have more classes than queues + +.TP +map +The priority to traffic class map. Maps priorities 0..15 to a specified +traffic class. + +.TP +queues +Provide count and offset of queue range for each traffic class. In the +format, +.B count@offset. +Queue ranges for each traffic classes cannot overlap and must be a +contiguous range of queues. + +.TP +hw +Set to +.B 1 +to support hardware offload. Set to +.B 0 +to configure user specified values in software only. 
+The default value of this parameter is +.B 1 + +.TP +mode +Set to +.B channel +for full use of the mqprio options. Use +.B dcb +to offload only TC values and use hardware QOS defaults. Supported with 'hw' +set to 1 only. + +.TP +shaper +Use +.B bw_rlimit +to set bandwidth rate limits for a traffic class. Use +.B dcb +for hardware QOS defaults. Supported with 'hw' set to 1 only. + +.TP +min_rate +Minimum value of bandwidth rate limit for a traffic class. Supported only when +the +.B 'shaper' +argument is set to +.B 'bw_rlimit'. + +.TP +max_rate +Maximum value of bandwidth rate limit for a traffic class. Supported only when +the +.B 'shaper' +argument is set to +.B 'bw_rlimit'. + +.TP +fp +Selects whether traffic classes are express (deliver packets via the eMAC) or +preemptible (deliver packets via the pMAC), according to IEEE 802.1Q-2018 +clause 6.7.2 Frame preemption. Takes the form of an array (one element per +traffic class) with values being +.B 'E' +(for express) or +.B 'P' +(for preemptible). + +Multiple priorities which map to the same traffic class, as well as multiple +TXQs which map to the same traffic class, must have the same FP attributes. +To interpret the FP as an attribute per priority, the +.B 'map' +argument can be used for translation. To interpret FP as an attribute per TXQ, +the +.B 'queues' +argument can be used for translation. + +Traffic classes are express by default. The argument is supported only with +.B 'hw' +set to 1. Preemptible traffic classes are accepted only if the device has a MAC +Merge layer configurable through +.BR ethtool (8). + +.SH SEE ALSO +.BR ethtool (8) + +.SH EXAMPLE + +The following example shows how to attach priorities to 4 traffic classes ("num_tc 4"), +and then how to pair these traffic classes with 4 hardware queues with mqprio, +with hardware coordination ("hw 1", or left unspecified, because 1 is the default value). 
+Traffic class 0 (tc0) is mapped to hardware queue 0 (q0), tc1 is mapped to q1, +tc2 is mapped to q2, and tc3 is mapped to q3. + +.EX +# tc qdisc add dev eth0 root mqprio \ + num_tc 4 \ + map 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 \ + queues 1@0 1@1 1@2 1@3 \ + hw 1 +.EE + +The next example shows how to attach priorities to 3 traffic classes ("num_tc 3"), +and how to pair these traffic classes with 4 queues, +without hardware coordination ("hw 0"). +Traffic class 0 (tc0) is mapped to hardware queue 0 (q0), tc1 is mapped to q1, +and tc2 is mapped to q2 and q3, where the queue selection between these +two queues is somewhat randomly decided. + +.EX +# tc qdisc add dev eth0 root mqprio \ + num_tc 3 \ + map 0 0 0 0 1 1 1 1 2 2 2 2 2 2 2 2 \ + queues 1@0 1@1 2@2 \ + hw 0 +.EE + + +In both of the examples above, the priority values from 0 to 3 (prio0-3) are +mapped to tc0, prio4-7 are mapped to tc1, and +prio8-11 are mapped to tc2 ("map" attribute). The last four priority values +(prio12-15) are mapped in different ways in the two examples. +They are mapped to tc3 in the first example and mapped to tc2 in the second example. +The values of these two examples are the following: + + ┌────┬────┬───────┐ ┌────┬────┬────────┐ + │Prio│ tc │ queue │ │Prio│ tc │ queue │ + ├────┼────┼───────┤ ├────┼────┼────────┤ + │ 0 │ 0 │ 0 │ │ 0 │ 0 │ 0 │ + │ 1 │ 0 │ 0 │ │ 1 │ 0 │ 0 │ + │ 2 │ 0 │ 0 │ │ 2 │ 0 │ 0 │ + │ 3 │ 0 │ 0 │ │ 3 │ 0 │ 0 │ + │ 4 │ 1 │ 1 │ │ 4 │ 1 │ 1 │ + │ 5 │ 1 │ 1 │ │ 5 │ 1 │ 1 │ + │ 6 │ 1 │ 1 │ │ 6 │ 1 │ 1 │ + │ 7 │ 1 │ 1 │ │ 7 │ 1 │ 1 │ + │ 8 │ 2 │ 2 │ │ 8 │ 2 │ 2 or 3 │ + │ 9 │ 2 │ 2 │ │ 9 │ 2 │ 2 or 3 │ + │ 10 │ 2 │ 2 │ │ 10 │ 2 │ 2 or 3 │ + │ 11 │ 2 │ 2 │ │ 11 │ 2 │ 2 or 3 │ + │ 12 │ 3 │ 3 │ │ 12 │ 2 │ 2 or 3 │ + │ 13 │ 3 │ 3 │ │ 13 │ 2 │ 2 or 3 │ + │ 14 │ 3 │ 3 │ │ 14 │ 2 │ 2 or 3 │ + │ 15 │ 3 │ 3 │ │ 15 │ 2 │ 2 or 3 │ + └────┴────┴───────┘ └────┴────┴────────┘ + example1 example2 + + +Another example of queue mapping is the following. 
+There are 5 traffic classes, and there are 8 hardware queues. + +.EX +# tc qdisc add dev eth0 root mqprio \ + num_tc 5 \ + map 0 0 0 1 1 1 1 2 2 3 3 4 4 4 4 4 \ + queues 1@0 2@1 1@3 1@4 3@5 +.EE + +The value mapping is the following for this example: + + ┌───────┐ + tc0────┤Queue 0│◄────1@0 + ├───────┤ + ┌─┤Queue 1│◄────2@1 + tc1──┤ ├───────┤ + └─┤Queue 2│ + ├───────┤ + tc2────┤Queue 3│◄────1@3 + ├───────┤ + tc3────┤Queue 4│◄────1@4 + ├───────┤ + ┌─┤Queue 5│◄────3@5 + │ ├───────┤ + tc4──┼─┤Queue 6│ + │ ├───────┤ + └─┤Queue 7│ + └───────┘ + + +.SH AUTHORS +John Fastabend, <john.r.fastabend@intel.com> diff --git a/man/man8/tc-nat.8 b/man/man8/tc-nat.8 new file mode 100644 index 0000000..f3b17ef --- /dev/null +++ b/man/man8/tc-nat.8 @@ -0,0 +1,78 @@ +.TH "NAT action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +nat - stateless native address translation action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action nat" +.I DIRECTION OLD NEW + +.ti -8 +.IR DIRECTION " := { " +.BR ingress " | " egress " }" + +.ti -8 +.IR OLD " := " IPV4_ADDR_SPEC + +.ti -8 +.IR NEW " := " IPV4_ADDR_SPEC + +.ti -8 +.IR IPV4_ADDR_SPEC " := { " +.BR default " | " any " | " all " | " +\fIin_addr\fR[\fB/\fR{\fIprefix\fR|\fInetmask\fR}] +.SH DESCRIPTION +The +.B nat +action allows one to perform NAT without the overhead of conntrack, which is +desirable if the number of flows or addresses to perform NAT on is large. This +action is best used in combination with the +.B u32 +filter to allow for efficient lookups of a large number of stateless NAT rules +in constant time. +.SH OPTIONS +.TP +.B ingress +Translate destination addresses, i.e. perform DNAT. +.TP +.B egress +Translate source addresses, i.e. perform SNAT. +.TP +.I OLD +Specifies addresses which should be translated. +.TP +.I NEW +Specifies addresses which +.I OLD +should be translated into. +.SH NOTES +The accepted address format in +.IR OLD " and " NEW +is quite flexible. 
It may either consist of one of the keywords +.BR default ", " any " or " all , +representing the all-zero IP address, or of a combination of IP address and netmask +or prefix length separated by a slash +.RB ( / ) +sign. In any case, the mask (or prefix length) value of +.I OLD +is used for +.I NEW +as well so that a one-to-one mapping of addresses is assured. + +Address translation is done using a combination of binary operations. First, the +original (source or destination) address is matched against the value of +.IR OLD . +If the original address fits, the new address is created by taking the leading +bits from +.I NEW +(defined by the netmask of +.IR OLD ) +and taking the remaining bits from the original address. + +There is rudimentary support for upper layer protocols, namely TCP, UDP and ICMP. +While for the first two only checksum recalculation is performed, the action +also takes care of embedded IP headers in ICMP packets by translating the +respective address therein, too. +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-netem.8 b/man/man8/tc-netem.8 new file mode 100644 index 0000000..a4cc0d6 --- /dev/null +++ b/man/man8/tc-netem.8 @@ -0,0 +1,405 @@ +.TH NETEM 8 "25 November 2011" "iproute2" "Linux" +.SH NAME +netem \- Network Emulator +.SH SYNOPSIS +.B "tc qdisc ... 
dev" +.IR DEVICE " ] " +.BR "add netem" +.I OPTIONS + +.IR OPTIONS " := [ " LIMIT " ] [ " DELAY " ] [ " LOSS \ +" ] [ " CORRUPT " ] [ " DUPLICATION " ] [ " REORDERING " ] [ " RATE \ +" ] [ " SLOT " ] [ " SEED " ]" + +.IR LIMIT " := " +.B limit +.I packets + +.IR DELAY " := " +.BI delay +.IR TIME " [ " JITTER " [ " CORRELATION " ]]]" +.br + [ +.BR distribution " { "uniform " | " normal " | " pareto " | " paretonormal " } ]" + +.IR LOSS " := " +.BR loss " { " +.BI random +.IR PERCENT " [ " CORRELATION " ] |" +.br +.RB " " state +.IR p13 " [ " p31 " [ " p32 " [ " p23 " [ " p14 "]]]] |" +.br +.RB " " gemodel +.IR p " [ " r " [ " 1-h " [ " 1-k " ]]] } " +.RB " [ " ecn " ] " + +.IR CORRUPT " := " +.B corrupt +.IR PERCENT " [ " CORRELATION " ]]" + +.IR DUPLICATION " := " +.B duplicate +.IR PERCENT " [ " CORRELATION " ]]" + +.IR REORDERING " := " +.B reorder +.IR PERCENT " [ " CORRELATION " ] [ " +.B gap +.IR DISTANCE " ]" + +.IR RATE " := " +.B rate +.IR RATE " [ " PACKETOVERHEAD " [ " CELLSIZE " [ " CELLOVERHEAD " ]]]]" + +.IR SLOT " := " +.BR slot " { " +.IR MIN_DELAY " [ " MAX_DELAY " ] |" +.br +.RB " " distribution " { "uniform " | " normal " | " pareto " | " paretonormal " | " +.IR FILE " } " DELAY " " JITTER " } " +.br +.RB " [ " packets +.IR PACKETS " ] [ " +.BR bytes +.IR BYTES " ]" + +.IR SEED " := " +.B seed +.I VALUE + +.SH DESCRIPTION +The +.B netem +queue discipline provides Network Emulation functionality +for testing protocols by emulating the properties of real-world networks. + +The queue discipline provides one or more network impairments to packets +such as: delay, loss, duplication, and packet corruption. + +.SH OPTIONS +.TP +.BI limit " COUNT" +Limits the maximum number of packets the qdisc may hold when doing delay. + +.TP +.B delay +.IR TIME " [ " JITTER " [ " CORRELATION " ]]]" +.br +Delays the packets before sending. +The optional parameters allow introducing a delay variation and a correlation. 
+Delay and jitter values are expressed in milliseconds; +Correlation is set by specifying a percent of how much the previous delay +will impact the current random value. + +.TP +.BI distribution " TYPE" +Specifies a pattern for delay distribution. +.RS +.TP +.B uniform +Use an equally weighted distribution of packet delays. +.TP +.B normal +Use a Gaussian distribution of delays. +Sometimes called a Bell Curve. +.TP +.B pareto +Use a Pareto distribution of packet delays. +This is useful to emulate long-tail distributions. +.TP +.B paretonormal +This is a mix of +.B pareto +and +.B normal +distribution which has properties of both Bell curve and long tail. +.RE + +.TP +.BI loss " MODEL" +Drop packets based on a loss model. +.I MODEL +can be one of +.RS +.TP +.BI random " PERCENT" +Each packet loss is independent. +.TP +.BI state " P13 [ P31 [ P32 [ P23 P14 ]]]" +Use a 4-state Markov chain to describe packet loss. +.br +.I P13 +is the packet loss. +Optional parameters extend the model to 2-state +.IR P31 , +3-state +.IR P23 , +.I P32 +and 4-state +.IR P14 . + +The Markov chain states are: +.RS +.TP +.B 1 +good packet reception (no loss). +.TP +.B 2 +good reception within a burst. +.TP +.B 3 +burst losses. +.TP +.B 4 +independent losses. +.RE + +.TP +.BI gemodel " PERCENT [ R [ 1-H [ 1-K ]]]" +Use a Gilbert-Elliot (burst loss) model +based on: +.RS +.TP +.I PERCENT +probability of starting bad (lossy) state. +.TP +.I R +probability of exiting bad state. +.TP +.I "1-H" +loss probability in bad state. +.TP +.I "1-K" +loss probability in good state. +.RE +.RE + +.TP +.B ecn +Use +Explicit Congestion Notification (ECN) +to mark packets instead of dropping them. +A loss model has to be used for this to be enabled. +.TP +.BI corrupt " PERCENT" +modifies the contents of the packet at a random position +based on +.IR PERCENT . +.TP +.BI duplicate " PERCENT" +creates a copy of the packet before queuing. +.TP +.BI reorder " PERCENT" +modifies the order of packet in the queue. 
+.TP +.BI gap " DISTANCE" +sends some packets immediately. +The first +.I "(DISTANCE - 1)" +packets are delayed and the next packet is sent immediately. + +.TP +.BI rate " RATE [ PACKETOVERHEAD [ CELLSIZE [ CELLOVERHEAD ]]]" +Delays packets based on packet size to emulate a fixed link speed. +Optional parameters: +.RS +.TP +.I PACKETOVERHEAD +Specify a per packet overhead in bytes. +Used to simulate additional link layer headers. +A negative value can be used to simulate when the Ethernet header is +stripped (e.g. -14) or header compression is used. +.TP +.I CELLSIZE +simulate link layer schemes like ATM. +.TP +.I CELLOVERHEAD +specify per cell overhead. +.RE + +Rate throttling is impacted by several factors including the kernel clock +granularity. This will show up in an artificial packet compression (bursts). + +.TP +.BI slot " MIN_DELAY [ MAX_DELAY ]" +allows emulating slotted networks. +Defer delivering accumulated packets to within a slot. +Each available slot is configured with a minimum delay to acquire, +and an optional maximum delay. +.TP +.B slot distribution +allows configuring based on distribution similar to +.B distribution +option for packet delays. + +These slot options can provide a crude approximation of bursty MACs such as +DOCSIS, WiFi, and LTE. + +Slot emulation is limited by several factors: the kernel clock granularity, +as with a rate, and attempts to deliver many packets within a slot will be +smeared by the timer resolution, and by the underlying native bandwidth also. + +It is possible to combine slotting with a rate, in which case complex behaviors +where either the rate, or the slot limits on bytes or packets per slot, govern +the actual delivered rate. + +.TP +.BI seed " VALUE" +Specifies a seed to guide and reproduce the randomly generated +loss or corruption events. + +.SH LIMITATIONS +Netem is limited by the timer granularity in the kernel. +Rate and delay may be impacted by clock interrupts. 
+.PP +Mixing forms of reordering may lead to unexpected results. +For any method of reordering to work, some delay is necessary. +If the delay is less than the inter-packet arrival time then +no reordering will be seen. +Due to mechanisms like TSQ (TCP Small Queues), for TCP performance test +results to be realistic netem must be placed on the ingress of the +receiver host. +.PP +Combining netem with other qdiscs is possible but may not always +work because netem uses the skb control block to set delays. + +.SH EXAMPLES +.PP +.EX +# tc qdisc add dev eth0 root netem delay 100ms +.EE +.RS 4 +Add fixed amount of delay to all packets going out on device eth0. +Each packet will have an added delay of 100ms. +.RE +.PP +.EX +# tc qdisc change dev eth0 root netem delay 100ms 10ms 25% +.EE +.RS 4 +This causes the added delay to be 100ms ± 10ms +and the next packet delay value will be biased by 25% on the most recent delay. +This isn't a true statistical correlation, but an approximation. +.RE +.PP +.EX +# tc qdisc change dev eth0 root netem delay 100ms 20ms distribution normal +.EE +.RS 4 +This delays packets according to a normal distribution (Bell curve) +over a range of 100ms ± 20ms. +.RE +.PP +.EX +# tc qdisc change dev eth0 root netem loss 0.1% +.EE +.RS 4 +This causes 1/10th of a percent (i.e. 1 out of 1000) packets to be +randomly dropped. + +An optional correlation may also be added. +This causes the random number generator to be less random and can be used to emulate packet burst losses. +.RE +.PP +.EX +# tc qdisc change dev eth0 root netem duplicate 1% +.EE +.RS 4 +This causes one percent of the packets sent on eth0 to be duplicated. +.RE +.PP +.EX +# tc qdisc change dev eth0 root netem loss 0.3% 25% +.EE +.RS 4 +This will cause 0.3% of packets to be lost, +and each successive probability is biased by 25% of the previous one. +.RE +.PP +There are two different ways to specify reordering. +The gap method uses a fixed sequence and reorders every Nth packet. 
+.EX +# tc qdisc change dev eth0 root netem gap 5 delay 10ms +.EE +.RS 4 +This causes every 5th (10th, 15th, …) packet to be sent immediately +and every other packet to be delayed by 10ms. +This is predictable and useful for base protocol testing like reassembly. +.RE +.PP +The reorder form uses a percentage of the packets to get misordered. +.EX +# tc qdisc change dev eth0 root netem delay 10ms reorder 25% 50% +.EE +In this example, 25% of packets (with a correlation of 50%) will get sent immediately, others will be delayed by 10ms. +.PP +Packets will also get reordered if jitter is large enough. +.EX +# tc qdisc change dev eth0 root netem delay 100ms 75ms +.EE +.RS 4 +If the first packet gets a random delay of 100ms (100ms base - 0ms jitter) +and the second packet is sent 1ms later and gets a delay of 50ms (100ms base - 50ms jitter); +the second packet will be sent first. +This is because the queue discipline tfifo inside netem +keeps packets in order by time to send. +.RE +.PP +If you don't want this behavior then replace the internal +queue discipline tfifo with a simple FIFO queue discipline. +.EX +# tc qdisc add dev eth0 root handle 1: netem delay 10ms 100ms +# tc qdisc add dev eth0 parent 1:1 pfifo limit 1000 +.EE + +.PP +Example of using rate control and cell size. +.EX +# tc qdisc add dev eth0 root netem rate 5kbit 20 100 5 +.EE +.RS 4 +Delay all outgoing packets on device eth0 with a rate of 5kbit, a per packet +overhead of 20 bytes, a cellsize of 100 bytes and a per-cell overhead of 5 bytes. +.RE + +.PP +It is possible to selectively apply impairment using traffic classification. 
+.EX +# tc qdisc add dev eth0 root handle 1: prio +# tc qdisc add dev eth0 parent 1:3 handle 30: \ + tbf rate 20kbit buffer 1600 limit 3000 +# tc qdisc add dev eth0 parent 30:1 handle 31: \ + netem delay 200ms 10ms distribution normal +# tc filter add dev eth0 protocol ip parent 1:0 prio 3 u32 \ + match ip dst 65.172.181.4/32 flowid 1:3 +.EE +.RS 4 +This example uses a priority queueing discipline; +a TBF is added to do rate control; and a simple netem delay. +A filter classifies all packets going to 65.172.181.4 as being priority 3. +.PP +.SH SOURCES +.IP " 1. " 4 +Hemminger S. , "Network Emulation with NetEm", Open Source Development Lab, +April 2005 +.UR http://devresources.linux-foundation.org/shemminger/netem/LCA2005_paper.pdf +.UE + +.IP " 2. " 4 +Salsano S., Ludovici F., Ordine A., "Definition of a general and intuitive loss +model for packet networks and its implementation in the Netem module in the +Linux kernel", available at +.UR http://netgroup.uniroma2.it/NetemCLG +.UE + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHOR +Netem was written by Stephen Hemminger at Linux foundation and was +inspired by NISTnet. + +Original manpage was created by Fabio Ludovici +<fabio.ludovici at yahoo dot it> and Hagen Paul Pfeifer +<hagen@jauu.net>. diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8 new file mode 100644 index 0000000..2ea4292 --- /dev/null +++ b/man/man8/tc-pedit.8 @@ -0,0 +1,402 @@ +.TH "Generic packet editor action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +pedit - generic packet editor action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... 
" "action pedit [ex] munge " { +.IR RAW_OP " | " LAYERED_OP " | " EXTENDED_LAYERED_OP " } [ " CONTROL " ]" + +.ti -8 +.IR RAW_OP " := " +.BI offset " OFFSET" +.RB "{ " u8 " | " u16 " | " u32 " } [" +.IR AT_SPEC " ] " CMD_SPEC + +.ti -8 +.IR AT_SPEC " := " +.BI at " AT " offmask " MASK " shift " SHIFT" + +.ti -8 +.IR LAYERED_OP " := { " +.BI ip " IPHDR_FIELD" +| +.BI ip " BEYOND_IPHDR_FIELD" +.RI } " CMD_SPEC" + +.ti -8 +.IR EXTENDED_LAYERED_OP " := { " +.BI eth " ETHHDR_FIELD" +| +.BI ip " IPHDR_FIELD" +| +.BI ip " EX_IPHDR_FIELD" +| +.BI ip6 " IP6HDR_FIELD" +| +.BI tcp " TCPHDR_FIELD" +| +.BI udp " UDPHDR_FIELD" +.RI } " CMD_SPEC" + +.ti -8 +.IR ETHHDR_FIELD " := { " +.BR src " | " dst " | " type " }" + +.ti -8 +.IR IPHDR_FIELD " := { " +.BR src " | " dst " | " tos " | " dsfield " | " ihl " | " protocol " |" +.BR precedence " | " nofrag " | " firstfrag " | " ce " | " df " }" + +.ti -8 +.IR BEYOND_IPHDR_FIELD " := { " +.BR dport " | " sport " | " icmp_type " | " icmp_code " }" + +.ti -8 +.IR EX_IPHDR_FIELD " := { " +.BR ttl " }" + + +.ti -8 +.IR IP6HDR_FIELD " := { " +.BR src " | " dst " | " traffic_class " | " flow_lbl " | " payload_len " | " +.BR nexthdr " | " hoplimit " }" + +.ti -8 +.IR TCPHDR_FIELD " := { " +.BR sport " | " dport " | " flags " }" + +.ti -8 +.IR UDPHDR_FIELD " := { " +.BR sport " | " dport " }" + +.ti -8 +.IR CMD_SPEC " := {" +.BR clear " | " invert " | " set +.IR VAL " | " +.BR add +.IR VAL " | " +.BR decrement " | " +.BR preserve " } [ " retain +.IR RVAL " ]" + +.ti -8 +.IR CONTROL " := {" +.BR reclassify " | " pipe " | " drop " | " shot " | " continue " | " pass " | " goto " " chain " " CHAIN_INDEX " }" +.SH DESCRIPTION +The +.B pedit +action can be used to change arbitrary packet data. The location of data to +change can either be specified by giving an offset and size as in +.IR RAW_OP , +or for header values by naming the header and field to edit the size is then +chosen automatically based on the header field size. 
+.SH OPTIONS +.TP +.B ex +Use extended pedit. +.I EXTENDED_LAYERED_OP +and the add/decrement +.I CMD_SPEC +are allowed only in this mode. +.TP +.BI offset " OFFSET " "\fR{ \fBu32 \fR| \fBu16 \fR| \fBu8 \fR}" +Specify the offset at which to change data. +.I OFFSET +is a signed integer; its base is automatically chosen (e.g. hex if prefixed by +.B 0x +or octal if prefixed by +.BR 0 ). +The second argument specifies the length of data to change, that is four bytes +.RB ( u32 ), +two bytes +.RB ( u16 ) +or a single byte +.RB ( u8 ). +.TP +.BI at " AT " offmask " MASK " shift " SHIFT" +This is an optional part of +.IR RAW_OP +which allows one to have a variable +.I OFFSET +depending on packet data at offset +.IR AT , +which is binary ANDed with +.I MASK +and right-shifted by +.I SHIFT +before adding it to +.IR OFFSET . +.TP +.BI eth " ETHHDR_FIELD" +Change an ETH header field. The supported keywords for +.I ETHHDR_FIELD +are: +.RS +.TP +.B src +.TQ +.B dst +Source or destination MAC address in the standard format: XX:XX:XX:XX:XX:XX +.TP +.B type +Ether-type in numeric value +.RE +.TP +.BI ip " IPHDR_FIELD" +Change an IPv4 header field. The supported keywords for +.I IPHDR_FIELD +are: +.RS +.TP +.B src +.TQ +.B dst +Source or destination IP address, a four-byte value. +.TP +.B tos +.TQ +.B dsfield +.TQ +.B precedence +Type Of Service field, an eight-bit value. +.TP +.B ihl +Change the IP Header Length field, a four-bit value. +.TP +.B protocol +Next-layer Protocol field, an eight-bit value. +.TP +.B nofrag +.TQ +.B firstfrag +.TQ +.B ce +.TQ +.B df +.TQ +.B mf +Change IP header flags. Note that the value to pass to the +.B set +command is not just a bit value, but the full byte including the flags field. +However, only the relevant bits of that value are respected; the rest are ignored. +.RE +.TP +.BI ip " BEYOND_IPHDR_FIELD" +Supported only for non-extended layered op. 
It is passed to the kernel as +offsets relative to the beginning of the IP header and assumes the IP header is +of minimum size (20 bytes). The supported keywords for +.I BEYOND_IPHDR_FIELD +are: +.RS +.TP +.B dport +.TQ +.B sport +Destination or source port numbers, a 16-bit value. Indeed, IPv4 headers don't +contain this information. Instead, this will set an offset which suits at least +TCP and UDP if the IP header is of minimum size (20 bytes). If not, this will do +unexpected things. +.TP +.B icmp_type +.TQ +.B icmp_code +Again, this allows one to change data past the actual IP header itself. It assumes +an ICMP header is present immediately following the (minimal sized) IP header. +If it is not, or the latter is bigger than the minimum of 20 bytes, this will do +unexpected things. These fields are eight-bit values. +.RE +.TP +.BI ip " EX_IPHDR_FIELD" +Supported only when +.I ex +is used. The supported keywords for +.I EX_IPHDR_FIELD +are: +.RS +.TP +.B ttl +.RE +.TP +.BI ip6 " IP6HDR_FIELD" +The supported keywords for +.I IP6HDR_FIELD +are: +.RS +.TP +.B src +.TQ +.B dst +.TQ +.B traffic_class +.TQ +.B flow_lbl +.TQ +.B payload_len +.TQ +.B nexthdr +.TQ +.B hoplimit +.RE +.TP +.BI tcp " TCPHDR_FIELD" +The supported keywords for +.I TCPHDR_FIELD +are: +.RS +.TP +.B sport +.TQ +.B dport +Source or destination TCP port number, a 16-bit value. +.TP +.B flags +.RE +.TP +.BI udp " UDPHDR_FIELD" +The supported keywords for +.I UDPHDR_FIELD +are: +.RS +.TP +.B sport +.TQ +.B dport +Source or destination UDP port number, a 16-bit value. +.RE +.TP +.B clear +Clear the addressed data (i.e., set it to zero). +.TP +.B invert +Swap every bit in the addressed data. +.TP +.BI set " VAL" +Set the addressed data to a specific value. The size of +.I VAL +is defined by either one of the +.BR u32 ", " u16 " or " u8 +keywords in +.IR RAW_OP , +or the size of the addressed header field in +.IR LAYERED_OP . +.TP +.BI add " VAL" +Add a specific value to the addressed data. 
The size of +.I VAL +is defined by the size of the addressed header field in +.IR EXTENDED_LAYERED_OP . +This operation is supported only for extended layered op. +.TP +.BI decrement +Decrease the addressed data by one. +This operation is supported only for +.BR ip " " ttl " and " ip6 " " hoplimit "." +.TP +.B preserve +Keep the addressed data as is. +.TP +.BI retain " RVAL" +This optional extra part of +.I CMD_SPEC +allows one to exclude bits from being changed. Supported only for 32 bits fields +or smaller. +.TP +.I CONTROL +The following keywords allow one to control how the tree of qdisc, classes, +filters and actions is further traversed after this action. +.RS +.TP +.B reclassify +Restart with the first filter in the current list. +.TP +.B pipe +Continue with the next action attached to the same filter. +.TP +.B drop +.TQ +.B shot +Drop the packet. +.TP +.B continue +Continue classification with the next filter in line. +.TP +.B pass +Finish classification process and return to calling qdisc for further packet +processing. This is the default. +.RE +.SH EXAMPLES +Being able to edit packet data, one could do all kinds of things, such as e.g. +implementing port redirection. Certainly not the most useful application, but +as an example it should do: + +First, qdiscs need to be set up to attach filters to. For the receive path, a simple +.B ingress +qdisc will do, for transmit path a classful qdisc +.RB ( HTB +in this case) is necessary: + +.RS +.EX +tc qdisc replace dev eth0 root handle 1: htb +tc qdisc add dev eth0 ingress handle ffff: +.EE +.RE + +Finally, a filter with +.B pedit +action can be added for each direction. 
In this case, +.B u32 +is used matching on the port number to redirect from, while +.B pedit +then does the actual rewriting: + +.RS +.EX +tc filter add dev eth0 parent 1: u32 \\ + match ip dport 23 0xffff \\ + action pedit pedit munge ip dport set 22 +tc filter add dev eth0 parent ffff: u32 \\ + match ip sport 22 0xffff \\ + action pedit pedit munge ip sport set 23 +tc filter add dev eth0 parent ffff: u32 \\ + match ip sport 22 0xffff \\ + action pedit ex munge ip dst set 192.168.1.199 +tc filter add dev eth0 parent ffff: u32 \\ + match ip sport 22 0xffff \\ + action pedit ex munge ip6 dst set fe80::dacb:8aff:fec7:320e +tc filter add dev eth0 parent ffff: u32 \\ + match ip sport 22 0xffff \\ + action pedit ex munge eth dst set 11:22:33:44:55:66 +tc filter add dev eth0 parent ffff: u32 \\ + match ip dport 23 0xffff \\ + action pedit ex munge tcp dport set 22 +.EE +.RE + +To rewrite just part of a field, use the +.B retain +directive. E.g. to overwrite the DSCP part of a dsfield with $DSCP, without +touching ECN: + +.RS +.EX +tc filter add dev eth0 ingress flower ... \\ + action pedit ex munge ip dsfield set $((DSCP << 2)) retain 0xfc +.EE +.RE + +And vice versa, to set ECN to e.g. 1 without impacting DSCP: + +.RS +.EX +tc filter add dev eth0 ingress flower ... \\ + action pedit ex munge ip dsfield set 1 retain 0x3 +.EE +.RE + +.SH SEE ALSO +.BR tc (8), +.BR tc-htb (8), +.BR tc-u32 (8) diff --git a/man/man8/tc-pfifo.8 b/man/man8/tc-pfifo.8 new file mode 100644 index 0000000..ed23850 --- /dev/null +++ b/man/man8/tc-pfifo.8 @@ -0,0 +1 @@ +.so man8/tc-bfifo.8 diff --git a/man/man8/tc-pfifo_fast.8 b/man/man8/tc-pfifo_fast.8 new file mode 100644 index 0000000..0029d67 --- /dev/null +++ b/man/man8/tc-pfifo_fast.8 @@ -0,0 +1,53 @@ +.TH PFIFO_FAST 8 "10 January 2002" "iproute2" "Linux" +.SH NAME +pfifo_fast \- three-band first in, first out queue + +.SH DESCRIPTION +pfifo_fast is the default qdisc of each interface. 
+ +Whenever an interface is created, the pfifo_fast qdisc is automatically used +as a queue. If another qdisc is attached, it preempts the default +pfifo_fast, which automatically returns to function when an existing qdisc +is detached. + +In this sense this qdisc is magic, and unlike other qdiscs. + +.SH ALGORITHM +The algorithm is very similar to that of the classful +.BR tc-prio (8) +qdisc. +.B pfifo_fast +is like three +.BR tc-pfifo (8) +queues side by side, where packets can be enqueued in any of the three bands +based on their Type of Service bits or assigned priority. + +Not all three bands are dequeued simultaneously - as long as lower bands +have traffic, higher bands are never dequeued. This can be used to +prioritize interactive traffic or penalize 'lowest cost' traffic. + +Each band can be txqueuelen packets long, as configured with +.BR ip (8). +Additional packets coming in are not enqueued but are instead dropped. + +See +.BR tc-prio (8) +for complete details on how TOS bits are translated into bands. +.SH PARAMETERS +.TP +txqueuelen +The length of the three bands depends on the interface txqueuelen, as +specified with +.BR ip (8). + +.SH BUGS +Does not maintain statistics and does not show up in tc qdisc ls. This is because +it is the automatic default in the absence of a configured qdisc. + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHORS +Alexey N. Kuznetsov, <kuznet@ms2.inr.ac.ru> + +This manpage maintained by bert hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-pie.8 b/man/man8/tc-pie.8 new file mode 100644 index 0000000..5a8c782 --- /dev/null +++ b/man/man8/tc-pie.8 @@ -0,0 +1,148 @@ +.TH PIE 8 "16 January 2014" "iproute2" "Linux" +.SH NAME +PIE \- Proportional Integral controller-Enhanced AQM algorithm +.SH SYNOPSIS +.B tc qdisc ... 
pie +[ +.B limit +PACKETS ] [ +.B target +TIME ] [ +.B tupdate +TIME ] [ +.B alpha +int ] [ +.B beta +int ] [ +.B ecn +| +.B noecn +] [ +.B bytemode +| +.B nobytemode +] [ +.B dq_rate_estimator +| +.B no_dq_rate_estimator +] + +.SH DESCRIPTION +Proportional Integral controller-Enhanced (PIE) is a control theoretic active +queue management scheme. It is based on the proportional integral controller but +aims to control delay. The main design goals are + o Low latency control + o High link utilization + o Simple implementation + o Guaranteed stability and fast responsiveness + +.SH ALGORITHM +PIE is designed to control delay effectively. First, an average dequeue rate is +estimated based on the standing queue. The rate is used to calculate the current +delay. Then, on a periodic basis, the delay is used to calculate the dropping +probability. Finally, on arrival, a packet is dropped (or marked) based on this +probability. + +PIE makes adjustments to the probability based on the trend of the delay, i.e. +whether it is going up or down. The delay converges quickly to the target value +specified. + +alpha and beta are statically chosen parameters that control the drop probability +growth and are determined through control theoretic approaches. alpha determines how +the deviation between the current and target latency changes probability. beta exerts +additional adjustments depending on the latency trend. + +The drop probability is used to mark packets in ecn mode. However, as in RED, +beyond 10% packets are dropped based on this probability. The bytemode is used +to drop packets proportional to the packet size. + +Additional details can be found in the paper cited below. + +.SH PARAMETERS +.SS limit +limit on the queue size in packets. Incoming packets are dropped when this limit +is reached. Default is 1000 packets. + +.SS target +is the expected queue delay. The default target delay is 15ms. 
+ +.SS tupdate +is the frequency at which the system drop probability is calculated. The default is 15ms. + +.SS alpha +.SS beta +alpha and beta are parameters chosen to control the drop probability. These +should be in the range between 0 and 32. + +.SS ecn | noecn +is used to mark packets instead of dropping. +.B ecn +to turn on ecn mode, +.B noecn +to turn off ecn mode. By default, +.B ecn +is turned off. + +.SS bytemode | nobytemode +is used to scale drop probability proportional to packet size. +.B bytemode +to turn on bytemode, +.B nobytemode +to turn off bytemode. By default, +.B bytemode +is turned off. + +.SS dq_rate_estimator | no_dq_rate_estimator +is used to calculate delay using Little's law. +.B dq_rate_estimator +to turn on dq_rate_estimator, +.B no_dq_rate_estimator +to turn off dq_rate_estimator. By default, +.B dq_rate_estimator +is turned off. + +.SH EXAMPLES + # tc qdisc add dev eth0 root pie + # tc -s qdisc show + qdisc pie 8036: dev eth0 root refcnt 2 limit 1000p target 15.0ms tupdate 16.0ms alpha 2 beta 20 + Sent 31216108 bytes 20800 pkt (dropped 80, overlimits 0 requeues 0) + backlog 16654b 11p requeues 0 + prob 0.006161 delay 15666us + pkts_in 20811 overlimit 0 dropped 80 maxq 50 ecn_mark 0 + + # tc qdisc add dev eth0 root pie dq_rate_estimator + # tc -s qdisc show + qdisc pie 8036: dev eth0 root refcnt 2 limit 1000p target 15.0ms tupdate 16.0ms alpha 2 beta 20 + Sent 63947420 bytes 42414 pkt (dropped 41, overlimits 0 requeues 0) + backlog 271006b 179p requeues 0 + prob 0.000092 delay 22200us avg_dq_rate 12145996 + pkts_in 41 overlimit 343 dropped 0 maxq 50 ecn_mark 0 + + # tc qdisc add dev eth0 root pie limit 100 target 20ms tupdate 30ms ecn + # tc -s qdisc show + qdisc pie 8036: dev eth0 root refcnt 2 limit 100p target 20.0ms tupdate 32.0ms alpha 2 beta 20 ecn + Sent 6591724 bytes 4442 pkt (dropped 27, overlimits 0 requeues 0) + backlog 18168b 12p requeues 0 + prob 0.008845 delay 11348us + pkts_in 4454 overlimit 0 dropped 27 maxq 65 
ecn_mark 0 + + # tc qdisc add dev eth0 root pie limit 100 target 50ms tupdate 30ms bytemode + # tc -s qdisc show + qdisc pie 8036: dev eth0 root refcnt 2 limit 100p target 50.0ms tupdate 32.0ms alpha 2 beta 20 bytemode + Sent 1616274 bytes 1137 pkt (dropped 0, overlimits 0 requeues 0) + backlog 13626b 9p requeues 0 + prob 0.000000 delay 0us + pkts_in 1146 overlimit 0 dropped 0 maxq 23 ecn_mark 0 + +.SH SEE ALSO +.BR tc (8), +.BR tc-codel (8) +.BR tc-red (8) + +.SH SOURCES + o RFC 8033: https://tools.ietf.org/html/rfc8033 + +.SH AUTHORS +PIE was implemented by Vijay Subramanian and Mythili Prabhu, also the authors of +this man page. Please report bugs and corrections to the Linux networking +development mailing list at <netdev@vger.kernel.org>. diff --git a/man/man8/tc-police.8 b/man/man8/tc-police.8 new file mode 100644 index 0000000..86e263b --- /dev/null +++ b/man/man8/tc-police.8 @@ -0,0 +1,168 @@ +.TH "Policing action in tc" 8 "20 Jan 2015" "iproute2" "Linux" + +.SH NAME +police - policing action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action police [" +.BI rate " RATE " burst +.IR BYTES [\fB/ BYTES "] ] [" +.BI pkts_rate " RATE " pkts_burst +.IR PACKETS "] [" +.B mtu +.IR BYTES [\fB/ BYTES "] ] [" +.BI peakrate " RATE" +] [ +.BI overhead " BYTES" +] [ +.BI linklayer " TYPE" +] [ +.IR CONTROL " ]" + +.ti -8 +.BR tc " ... " filter " ... [ " estimator +.IR "SAMPLE AVERAGE " ] +.BR "action police avrate" +.IR RATE " [ " CONTROL " ]" + +.ti -8 +.IR CONTROL " :=" +.BI conform-exceed " EXCEEDACT\fR[\fB/\fINOTEXCEEDACT" + +.ti -8 +.IR EXCEEDACT/NOTEXCEEDACT " := { " +.BR pipe " | " ok " | " reclassify " | " drop " | " continue " | " goto " " chain " " CHAIN_INDEX " }" +.SH DESCRIPTION +The +.B police +action allows limiting of the byte or packet rate of traffic matched by the +filter it is attached to. 
+.P +There are two different algorithms available to measure the byte rate: The +first one uses an internal dual token bucket and is configured using the +.BR rate ", " burst ", " mtu ", " peakrate ", " overhead " and " linklayer +parameters. The second one uses an in-kernel sampling mechanism. It can be +fine-tuned using the +.B estimator +filter parameter. +.P +There is one algorithm available to measure packet rate and it is similar to +the first algorithm described for byte rate. It is configured using the +.BR pkts_rate " and " pkts_burst +parameters. +.P +At least one of the +.BR rate " and " pkts_rate +parameters must be configured. +.SH OPTIONS +.TP +.BI rate " RATE" +The maximum byte rate of packets passing this action. Those exceeding it will +be treated as defined by the +.B conform-exceed +option. +.TP +.BI burst " BYTES\fR[\fB/\fIBYTES\fR]" +Set the maximum allowed burst in bytes, optionally followed by a slash ('/') +sign and cell size which must be a power of 2. +.TP +.BI pkts_rate " RATE" +The maximum packet rate of packets passing this action. Those exceeding it will +be treated as defined by the +.B conform-exceed +option. +.TP +.BI pkts_burst " PACKETS" +Set the maximum allowed burst in packets. +.TP +.BI mtu " BYTES\fR[\fB/\fIBYTES\fR]" +This is the maximum packet size handled by the policer (larger ones will be +handled like they exceeded the configured rate). Setting this value correctly +will improve the scheduler's precision. +Value formatting is identical to +.B burst +above. Defaults to unlimited. +.TP +.BI peakrate " RATE" +Set the maximum bucket depletion rate, exceeding +.BR rate . +.TP +.BI avrate " RATE" +Make use of an in-kernel bandwidth rate estimator and match the given +.I RATE +against it. +.TP +.BI overhead " BYTES" +Account for protocol overhead of encapsulating output devices when computing +.BR rate " and " peakrate . +.TP +.BI linklayer " TYPE" +Specify the link layer type. 
+.I TYPE +may be one of +.B ethernet +(the default), +.BR atm " or " adsl +(which are synonyms). It is used to align the precomputed rate tables to ATM +cell sizes, for +.B ethernet +no action is taken. +.TP +.BI estimator " SAMPLE AVERAGE" +Fine-tune the in-kernel packet rate estimator. +.IR SAMPLE " and " AVERAGE +are time values and control the frequency at which samples are taken and over +what timespan an average is built. +.TP +.BI conform-exceed " EXCEEDACT\fR[\fB/\fINOTEXCEEDACT\fR]" +Define how to handle packets which exceed or conform to the +configured bandwidth limit. Possible values are: +.RS +.IP continue +Don't do anything, just continue with the next action in line. +.IP drop +Drop the packet immediately. +.IP shot +This is a synonym for +.BR drop . +.IP ok +Accept the packet. This is the default for conforming packets. +.IP pass +This is a synonym for +.BR ok . +.IP reclassify +Treat the packet as non-matching to the filter this action is attached to and +continue with the next filter in line (if any). This is the default for +exceeding packets. +.IP pipe +Pass the packet to the next action in line. +.RE +.SH EXAMPLES +A typical application of the police action is to enforce ingress traffic rate +by dropping exceeding packets. Although better done on the sender's side, +especially in scenarios with lack of peer control (e.g. with dial-up providers) +this is often the best one can do in order to keep latencies low under high +load. The following establishes input bandwidth policing to 1mbit/s using the +.B ingress +qdisc and +.B u32 +filter: + +.RS +.EX +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \\ + match u32 0 0 \\ + police rate 1mbit burst 100k +.EE +.RE + +As an action can not live on its own, there always has to be a filter involved as a link between qdisc and action.
The example above uses +.B u32 +for that, which is configured to effectively match any packet (passing it to the +.B police +action thereby). + +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-prio.8 b/man/man8/tc-prio.8 new file mode 100644 index 0000000..605f3d3 --- /dev/null +++ b/man/man8/tc-prio.8 @@ -0,0 +1,185 @@ +.TH PRIO 8 "16 December 2001" "iproute2" "Linux" +.SH NAME +PRIO \- Priority qdisc +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B ( parent +classid +.B | root) [ handle +major: +.B ] prio [ bands +bands +.B ] [ priomap +band band band... +.B ] [ estimator +interval timeconstant +.B ] + +.SH DESCRIPTION +The PRIO qdisc is a simple classful queueing discipline that contains +an arbitrary number of classes of differing priority. The classes are +dequeued in numerical descending order of priority. PRIO is a scheduler +and never delays packets - it is a work-conserving qdisc, though the qdiscs +contained in the classes may not be. + +Very useful for lowering latency when there is no need for slowing down +traffic. + +.SH ALGORITHM +On creation with 'tc qdisc add', a fixed number of bands is created. Each +band is a class, although it is not possible to add classes with 'tc qdisc +add'; the number of bands to be created must instead be specified on the +command line attaching PRIO to its root. + +When dequeueing, band 0 is tried first and only if it did not deliver a +packet does PRIO try band 1, and so onwards. Maximum reliability packets +should therefore go to band 0, minimum delay to band 1 and the rest to band +2. + +As the PRIO qdisc itself will have minor number 0, band 0 is actually +major:1, band 1 is major:2, etc. For major, substitute the major number +assigned to the qdisc on 'tc qdisc add' with the +.B handle +parameter. + +.SH CLASSIFICATION +Three methods are available to PRIO to determine in which band a packet will +be enqueued.
+.TP +From userspace +A process with sufficient privileges can encode the destination class +directly with SO_PRIORITY, see +.BR socket(7). +.TP +with a tc filter +A tc filter attached to the root qdisc can point traffic directly to a class +.TP +with the priomap +Based on the packet priority, which in turn is derived from the Type of +Service assigned to the packet. +.P +Only the priomap is specific to this qdisc. +.SH QDISC PARAMETERS +.TP +bands +Number of bands. If changed from the default of 3, +.B priomap +must be updated as well. +.TP +priomap +The priomap maps the priority of +a packet to a class. The priority can either be set directly from userspace, +or be derived from the Type of Service of the packet. + +Determines how packet priorities, as assigned by the kernel, map to +bands. Mapping occurs based on the TOS octet of the packet, which looks like +this: + +.nf +0 1 2 3 4 5 6 7 ++---+---+---+---+---+---+---+---+ +| | | | +|PRECEDENCE | TOS |MBZ| +| | | | ++---+---+---+---+---+---+---+---+ +.fi + +The four TOS bits (the 'TOS field') are defined as: + +.nf +Binary Decimal Meaning +----------------------------------------- +1000 8 Minimize delay (md) +0100 4 Maximize throughput (mt) +0010 2 Maximize reliability (mr) +0001 1 Minimize monetary cost (mmc) +0000 0 Normal Service +.fi + +As there is 1 bit to the right of these four bits, the actual value of the +TOS field is double the value of the TOS bits. Tcpdump -v -v shows you the +value of the entire TOS field, not just the four bits. 
It is the value you +see in the first column of this table: + +.nf +TOS Bits Means Linux Priority Band +------------------------------------------------------------ +0x0 0 Normal Service 0 Best Effort 1 +0x2 1 Minimize Monetary Cost 0 Best Effort 1 +0x4 2 Maximize Reliability 0 Best Effort 1 +0x6 3 mmc+mr 0 Best Effort 1 +0x8 4 Maximize Throughput 2 Bulk 2 +0xa 5 mmc+mt 2 Bulk 2 +0xc 6 mr+mt 2 Bulk 2 +0xe 7 mmc+mr+mt 2 Bulk 2 +0x10 8 Minimize Delay 6 Interactive 0 +0x12 9 mmc+md 6 Interactive 0 +0x14 10 mr+md 6 Interactive 0 +0x16 11 mmc+mr+md 6 Interactive 0 +0x18 12 mt+md 4 Int. Bulk 1 +0x1a 13 mmc+mt+md 4 Int. Bulk 1 +0x1c 14 mr+mt+md 4 Int. Bulk 1 +0x1e 15 mmc+mr+mt+md 4 Int. Bulk 1 +.fi + +The second column contains the value of the relevant +four TOS bits, followed by their translated meaning. For example, 15 stands +for a packet wanting Minimal Monetary Cost, Maximum Reliability, Maximum +Throughput AND Minimum Delay. + +The fourth column lists the way the Linux kernel interprets the TOS bits, by +showing to which Priority they are mapped. + +The last column shows the result of the default priomap. On the command line, +the default priomap looks like this: + + 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 + +This means that priority 4, for example, gets mapped to band number 1. +The priomap also allows you to list higher priorities (> 7) which do not +correspond to TOS mappings, but which are set by other means. 
+ +This table from RFC 1349 (read it for more details) explains how +applications might very well set their TOS bits: + +.nf +TELNET 1000 (minimize delay) +FTP + Control 1000 (minimize delay) + Data 0100 (maximize throughput) + +TFTP 1000 (minimize delay) + +SMTP + Command phase 1000 (minimize delay) + DATA phase 0100 (maximize throughput) + +Domain Name Service + UDP Query 1000 (minimize delay) + TCP Query 0000 + Zone Transfer 0100 (maximize throughput) + +NNTP 0001 (minimize monetary cost) + +ICMP + Errors 0000 + Requests 0000 (mostly) + Responses <same as request> (mostly) +.fi + + +.SH CLASSES +PRIO classes cannot be configured further - they are automatically created +when the PRIO qdisc is attached. Each class however can contain yet a +further qdisc. + +.SH BUGS +Large amounts of traffic in the lower bands can cause starvation of higher +bands. Can be prevented by attaching a shaper (for example, +.BR tc-tbf(8) +to these bands to make sure they cannot dominate the link. + +.SH AUTHORS +Alexey N. Kuznetsov, <kuznet@ms2.inr.ac.ru>, J Hadi Salim +<hadi@cyberus.ca>. This manpage maintained by bert hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-red.8 b/man/man8/tc-red.8 new file mode 100644 index 0000000..662e4d8 --- /dev/null +++ b/man/man8/tc-red.8 @@ -0,0 +1,180 @@ +.TH RED 8 "13 December 2001" "iproute2" "Linux" +.SH NAME +red \- Random Early Detection +.SH SYNOPSIS +.B tc qdisc ... red +.B limit +bytes +.B [ min +bytes +.B ] [ max +bytes +.B ] avpkt +bytes +.B [ burst +packets +.B ] [ ecn ] [ harddrop ] [ nodrop ] [ bandwidth +rate +.B ] [ probability +chance +.B ] [ adaptive ] [ qevent early_drop block +index +.B ] [ qevent mark block +index +.B ] + +.SH DESCRIPTION +Random Early Detection is a classless qdisc which manages its queue size +smartly. Regular queues simply drop packets from the tail when they are +full, which may not be the optimal behaviour. RED also performs tail drop, +but does so in a more gradual way. 
+ +Once the queue hits a certain average length, packets enqueued have a +configurable chance of being marked (which may mean dropped). This chance +increases linearly up to a point called the +.B max +average queue length, although the queue might get bigger. + +This has a host of benefits over simple taildrop, while not being processor +intensive. It prevents synchronous retransmits after a burst in traffic, +which cause further retransmits, etc. + +The goal is to have a small queue size, which is good for interactivity +while not disturbing TCP/IP traffic with too many sudden drops after a burst +of traffic. + +Depending on whether ECN is configured, marking either means dropping or +purely marking a packet as overlimit. +.SH ALGORITHM +The average queue size is used for determining the marking +probability. This is calculated using an Exponential Weighted Moving +Average, which can be more or less sensitive to bursts. + +When the average queue size is below +.B min +bytes, no packet will ever be marked. When it exceeds +.BR min , +the probability of doing so climbs linearly up +to +.BR probability , +until the average queue size hits +.B max +bytes. Because +.B probability +is normally not set to 100%, the queue size might +conceivably rise above +.B max +bytes, so the +.B limit +parameter is provided to set a hard maximum for the size of the queue. + +.SH PARAMETERS +.TP +min +Average queue size at which marking becomes a possibility. Defaults to +.B max +/3 + +.TP +max +At this average queue size, the marking probability is maximal. Should be at +least twice +.B min +to prevent synchronous retransmits, higher for low +.BR min . +Defaults to +.B limit +/4 +.TP +probability +Maximum probability for marking, specified as a floating point +number from 0.0 to 1.0. Suggested values are 0.01 or 0.02 (1 or 2%, +respectively). Default : 0.02 +.TP +limit +Hard limit on the real (not average) queue size in bytes. Further packets +are dropped. Should be set higher than max+burst.
It is advised to set this +a few times higher than +.B max. +.TP +burst +Used for determining how fast the average queue size is influenced by the +real queue size. Larger values make the calculation more sluggish, allowing +longer bursts of traffic before marking starts. Real life experiments +support the following guideline: (min+min+max)/(3*avpkt). +.TP +avpkt +Specified in bytes. Used with burst to determine the time constant for +average queue size calculations. 1000 is a good value. +.TP +bandwidth +This rate is used for calculating the average queue size after some +idle time. Should be set to the bandwidth of your interface. Does not mean +that RED will shape for you! Optional. Default : 10Mbit +.TP +ecn +As mentioned before, RED can either 'mark' or 'drop'. Explicit Congestion +Notification allows RED to notify remote hosts that their rate exceeds the +amount of bandwidth available. Non-ECN capable hosts can only be notified by +dropping a packet. If this parameter is specified, packets which indicate +that their hosts honor ECN will only be marked and not dropped, unless the +queue size hits +.B limit +bytes. Recommended. +.TP +harddrop +If average flow queue size is above +.B max +bytes, this parameter forces a drop instead of ecn marking. +.TP +nodrop +With this parameter, traffic that should be marked, but is not ECN-capable, is +enqueued. Without the parameter it is early-dropped. +.TP +adaptive +(Added in linux-3.3) Sets RED in adaptive mode as described in http://icir.org/floyd/papers/adaptiveRed.pdf +.nf +Goal of Adaptive RED is to make 'probability' dynamic value between 1% and 50% to reach the target average queue : +.B (max - min) / 2 +.fi + +.SH QEVENTS +See tc (8) for some general notes about qevents. The RED qdisc supports the +following qevents: + +.TP +early_drop +The associated block is executed when packets are early-dropped. This includes +non-ECT packets in ECN mode. 
+.TP +mark +The associated block is executed when packets are marked in ECN mode. + +.SH EXAMPLE + +.P +# tc qdisc add dev eth0 parent 1:1 handle 10: red + limit 400000 min 30000 max 90000 avpkt 1000 + burst 55 ecn adaptive bandwidth 10Mbit + +.SH SEE ALSO +.BR tc (8), +.BR tc-choke (8) + +.SH SOURCES +.TP +o +Floyd, S., and Jacobson, V., Random Early Detection gateways for +Congestion Avoidance. http://www.aciri.org/floyd/papers/red/red.html +.TP +o +Some changes to the algorithm by Alexey N. Kuznetsov. +.TP +o +Adaptive RED : http://icir.org/floyd/papers/adaptiveRed.pdf + +.SH AUTHORS +Alexey N. Kuznetsov, <kuznet@ms2.inr.ac.ru>, Alexey Makarenko +<makar@phoenix.kharkov.ua>, J Hadi Salim <hadi@nortelnetworks.com>, +Eric Dumazet <eric.dumazet@gmail.com>. +This manpage maintained by bert hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-route.8 b/man/man8/tc-route.8 new file mode 100644 index 0000000..b865cd1 --- /dev/null +++ b/man/man8/tc-route.8 @@ -0,0 +1,74 @@ +.TH "Route classifier in tc" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +route \- route traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " route " [ " from +.IR REALM " | " +.B fromif +.IR TAG " ] [ " +.B to +.IR REALM " ] [ " +.B classid +.IR CLASSID " ] [ " +.B action +.IR ACTION_SPEC " ]" +.SH DESCRIPTION +Match packets based on routing table entries. This filter centers around the +possibility to assign a +.B realm +to routing table entries. For any packet to be classified by this filter, a +routing table lookup is performed and the returned +.B realm +is used to decide on whether the packet is a match or not. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Push matching packets into the class identified by +.IR CLASSID . +.TP +.BI from " REALM" +.TQ +.BI fromif " TAG" +Perform source route lookups. 
+.I TAG +is the name of an interface which must be present on the system at the time of +.B tc +invocation. +.TP +.BI to " REALM" +Match if normal (i.e., destination) routing returns the given +.IR REALM . +.SH EXAMPLES +Consider the subnet 192.168.2.0/24 being attached to eth0: + +.RS +.EX +ip route add 192.168.2.0/24 dev eth0 realm 2 +.EE +.RE + +The following +.B route +filter will then match packets from that subnet: + +.RS +.EX +tc filter add ... route from 2 classid 1:2 +.EE +.RE + +and pass packets on to class 1:2. +.SH NOTES +Due to implementation details, +.B realm +values must be in a range from 0 to 255, inclusive. Alternatively, a verbose +name defined in /etc/iproute2/rt_realms may be given instead. +.SH SEE ALSO +.BR tc (8), +.BR ip-route (8) diff --git a/man/man8/tc-sample.8 b/man/man8/tc-sample.8 new file mode 100644 index 0000000..44fc262 --- /dev/null +++ b/man/man8/tc-sample.8 @@ -0,0 +1,122 @@ +.TH "Packet sample action in tc" 8 "31 Jan 2017" "iproute2" "Linux" + +.SH NAME +sample - packet sampling tc action +.SH SYNOPSIS +.in +8 +.ti -8 + +.BR tc " ... " "action sample rate" +.I RATE +.BR "group" +.I GROUP +.RB "[ " trunc +.IR SIZE " ] " +.RB "[ " index +.IR INDEX " ] " +.ti -8 + +.BR tc " ... " "action sample index " +.I INDEX +.ti -8 + +.SH DESCRIPTION +The +.B sample +action allows sampling packets matching classifier. + +The packets are chosen randomly according to the +.B rate +parameter, and are sampled using the +.B psample +generic netlink channel. The user can also specify packet truncation to save +user-kernel traffic. Each sample includes some informative metadata about the +original packet, which is sent using netlink attributes, alongside the original +packet data. + +The user can either specify the sample action parameters as presented in the +first form above, or use an existing sample action using its index, as presented +in the second form. 
+ +.SH SAMPLED PACKETS METADATA FIELDS +The metadata are delivered to userspace applications using the +.B psample +generic netlink channel, where each sample includes the following netlink +attributes: +.TP +.BI PSAMPLE_ATTR_IIFINDEX +The input interface index of the packet, if there is one. +.TP +.BI PSAMPLE_ATTR_OIFINDEX +The output interface index of the packet. This field is not relevant on ingress +sampling. +.TP +.BI PSAMPLE_ATTR_ORIGSIZE +The size of the original packet (before truncation). +.TP +.BI PSAMPLE_ATTR_SAMPLE_GROUP +The +.B psample +group the packet was sent to. +.TP +.BI PSAMPLE_ATTR_GROUP_SEQ +A sequence number of the sampled packet. This number is incremented with each +sampled packet of the current +.B psample +group. +.TP +.BI PSAMPLE_ATTR_SAMPLE_RATE +The rate the packet was sampled with. + +.SH OPTIONS +.TP +.BI rate " RATE" +The packet sample rate. +.I "RATE" +is the expected ratio between observed packets and sampled packets. For example, +.I "RATE" +of 100 will lead to an average of one sampled packet out of every 100 observed. +.TP +.BI trunc " SIZE" +When set, defines the maximum size of the sampled packets, and causes truncation +if needed. +.TP +.BI group " GROUP" +The +.B psample +group the packet will be sent to. The +.B psample +module defines the concept of groups, which allows the user to match specific +sampled packets in the case of multiple sampling rules, and thus identify only the +packets that came from a specific rule. +.TP +.BI index " INDEX" +Is a unique ID for an action. When creating a new action instance, this parameter +allows one to set the new action index. When using an existing action, this parameter +allows one to specify the existing action index. The index must be a 32-bit unsigned +integer greater than zero.
+.SH EXAMPLES +Sample one of every 100 packets flowing into interface eth0 to psample group 12: + +.RS +.EX +tc qdisc add dev eth0 handle ffff: ingress +tc filter add dev eth0 parent ffff: matchall \\ + action sample rate 100 group 12 index 19 +.EE +.RE + +Use the same action instance to sample eth1 too: + +.RS +.EX +tc qdisc add dev eth1 handle ffff: ingress +tc filter add dev eth1 parent ffff: matchall \\ + action sample index 19 +.EE +.RE + +.SH SEE ALSO +.BR tc (8), +.BR tc-matchall (8), +.BR psample (1) diff --git a/man/man8/tc-sfb.8 b/man/man8/tc-sfb.8 new file mode 100644 index 0000000..1f2b8c5 --- /dev/null +++ b/man/man8/tc-sfb.8 @@ -0,0 +1,213 @@ +.TH SFB 8 "August 2011" "iproute2" "Linux" +.SH NAME +sfb \- Stochastic Fair Blue +.SH SYNOPSIS +.B tc qdisc ... sfb +.B rehash +milliseconds +.B db +milliseconds +.B limit +packets +.B max +packets +.B target +packets +.B increment +float +.B decrement +float +.B penalty_rate +packets per second +.B penalty_burst +packets + +.SH DESCRIPTION +Stochastic Fair Blue is a classless qdisc to manage congestion based on +packet loss and link utilization history while trying to prevent +non-responsive flows (i.e. flows that do not react to congestion marking +or dropped packets) from impacting performance of responsive flows. +Unlike RED, where the marking probability has to be configured, BLUE +tries to determine the ideal marking probability automatically. + +.SH ALGORITHM + +The +.B BLUE +algorithm maintains a probability which is used to mark or drop packets +that are to be queued. If the queue overflows, the mark/drop probability +is increased. If the queue becomes empty, the probability is decreased. The +.B Stochastic Fair Blue +(SFB) algorithm is designed to protect TCP flows against non-responsive flows. + +This SFB implementation maintains 8 levels of 16 bins each for accounting. +Each flow is mapped into a bin of each level using a per-level hash value.
+ +Every bin maintains a marking probability, which gets increased or decreased +based on bin occupancy. If the number of packets exceeds the size of that +bin, the marking probability is increased. If the number drops to zero, it +is decreased. + +The marking probability is based on the minimum value of all bins a flow is +mapped into, thus, when a flow does not respond to marking or gradual packet +drops, the marking probability quickly reaches one. + +In this case, the flow is rate-limited to +.B penalty_rate +packets per second. + +.SH LIMITATIONS + +Due to SFB's nature, it is possible for a responsive flow to share all of its bins +with a non-responsive flow, causing the responsive flow to be misidentified as +being non-responsive. + +The probability of a responsive flow being misidentified is dependent on +the number of non-responsive flows, M. It is (1 - (1 - (1 / 16.0)) ** M) ** 8, +so for example with 10 non-responsive flows approximately 0.2% of responsive flows +will be misidentified. + +To mitigate this, SFB performs periodic re-hashing to avoid +misclassification for prolonged periods of time. + +The default hashing method will use source and destination IP addresses and port numbers +if possible, and also supports tunneling protocols. +Alternatively, an external classifier can be configured, too. + +.SH PARAMETERS +.TP +rehash +Time interval in milliseconds when queue perturbation occurs to avoid erroneously +detecting unrelated, responsive flows as being part of a non-responsive flow for +prolonged periods of time. +Defaults to 10 minutes. +.TP +db +Double buffering warmup wait time, in milliseconds. +To avoid destroying the probability history when rehashing is performed, this +implementation maintains a second set of levels/bins as described in section +4.4 of the SFB reference.
+While one set is used to manage the queue, a second set is warmed up: +Whenever a flow is then determined to be non-responsive, the marking +probabilities in the second set are updated. When the rehashing +happens, these bins will be used to manage the queue and all non-responsive +flows can be rate-limited immediately. +This value determines how much time has to pass before the 2nd set +will start to be warmed up. +Defaults to one minute, should be lower than +.B +rehash. +.TP +limit +Hard limit on the real (not average) total queue size in packets. +Further packets are dropped. Defaults to the transmit queue length of the +device the qdisc is attached to. +.TP +max +Maximum length of a buckets queue, in packets, before packets start being +dropped. Should be slightly larger than +.B target +, but should not be set to values exceeding 1.5 times that of +.B target . +Defaults to 25. +.TP +target +The desired average bin length. If the bin queue length reaches this value, +the marking probability is increased by +.B increment. +The default value depends on the +.B max +setting, with max set to 25 +.B target +will default to 20. +.TP +increment +A value used to increase the marking probability when the queue appears +to be over-used. Must be between 0 and 1.0. Defaults to 0.00050. +.TP +decrement +Value used to decrease the marking probability when the queue is found +to be empty. Must be between 0 and 1.0. +Defaults to 0.00005. +.TP +penalty_rate +The maximum number of packets belonging to flows identified as being +non-responsive that can be enqueued per second. Once this number has been +reached, further packets of such non-responsive flows are dropped. +Set this to a reasonable fraction of your uplink throughput; the +default value of 10 packets is probably too small. +.TP +penalty_burst +The number of packets a flow is permitted to exceed the penalty rate before packets +start being dropped. +Defaults to 20 packets. 
+ +.SH STATISTICS + +This qdisc exposes additional statistics via 'tc -s qdisc' output. +These are: +.TP +earlydrop +The number of packets dropped before a per-flow queue was full. +.TP +ratedrop +The number of packets dropped because of rate-limiting. +If this value is high, there are many non-reactive flows being +sent through sfb. In such cases, it might be better to +embed sfb within a classful qdisc to better control such +flows using a different, shaping qdisc. +.TP +bucketdrop +The number of packets dropped because a per-flow queue was full. +High bucketdrop may point to a high number of aggressive, short-lived +flows. +.TP +queuedrop +The number of packets dropped due to reaching the limit. This should normally be 0. +.TP +marked +The number of packets marked with ECN. +.TP +maxqlen +The length of the current longest per-flow (virtual) queue. +.TP +maxprob +The maximum per-flow drop probability. 1 means that some +flows have been detected as non-reactive. + +.SH NOTES + +SFB automatically enables use of Explicit Congestion Notification (ECN). +Also, this SFB implementation does not queue packets itself. +Rather, packets are enqueued to the inner qdisc (defaults to pfifo). +Because sfb maintains virtual queue states, the inner qdisc must not +drop a packet previously queued. +Furthermore, if a bucket's queue has a very high marking rate, +this implementation will start dropping packets instead of +marking them, as such a situation points to either bad congestion, or an +unresponsive flow. + +.SH EXAMPLE & USAGE + +To attach to interface $DEV, using default options: +.P +# tc qdisc add dev $DEV handle 1: root sfb + +Only use destination IP addresses for assigning packets to bins, perturbing +hash results every 10 minutes: +.P +# tc filter add dev $DEV parent 1: handle 1 flow hash keys dst perturb 600 + +.SH SEE ALSO +.BR tc (8), +.BR tc-red (8), +.BR tc-sfq (8) +.SH SOURCES +.TP +o +W. Feng, D. Kandlur, D. Saha, K.
Shin, BLUE: A New Class of Active Queue Management Algorithms, +U. Michigan CSE-TR-387-99, April 1999. + +.SH AUTHORS + +This SFB implementation was contributed by Juliusz Chroboczek and Eric Dumazet. diff --git a/man/man8/tc-sfq.8 b/man/man8/tc-sfq.8 new file mode 100644 index 0000000..ec4d8b8 --- /dev/null +++ b/man/man8/tc-sfq.8 @@ -0,0 +1,222 @@ +.TH TC 8 "24 January 2012" "iproute2" "Linux" +.SH NAME +sfq \- Stochastic Fairness Queueing +.SH SYNOPSIS +.B tc qdisc ... +.B [ divisor +hashtablesize +.B ] [ limit +packets +.B ] [ perturb +seconds +.B ] [ quantum +bytes +.B ] [ flows +number +.B ] [ depth +number +.B ] [ headdrop +.B ] [ redflowlimit +bytes +.B ] [ min +bytes +.B ] [ max +bytes +.B ] [ avpkt +bytes +.B ] [ burst +packets +.B ] [ probability +P +.B ] [ ecn +.B ] [ harddrop ] +.SH DESCRIPTION + +Stochastic Fairness Queueing is a classless queueing discipline available for +traffic control with the +.BR tc (8) +command. + +SFQ does not shape traffic but only schedules the transmission of packets, based on 'flows'. +The goal is to ensure fairness so that each flow is able to send data in turn, thus preventing +any single flow from drowning out the rest. + +This may in fact have some effect in mitigating a Denial of Service attempt. + +SFQ is work-conserving and therefore always delivers a packet if it has one available. +.SH ALGORITHM +On enqueueing, each packet is assigned to a hash bucket, based on the packets hash value. +This hash value is either obtained from an external flow classifier (use +.B +tc filter +to set them), or a default internal classifier if no external classifier has been configured. + +When the internal classifier is used, sfq uses +.TP +(i) +Source address +.TP +(ii) +Destination address +.TP +(iii) +Source and Destination port +.P +If these are available. SFQ knows about ipv4 and ipv6 and also UDP, TCP and ESP. +Packets with other protocols are hashed based on the 32bits representation of their +destination and source. 
A flow corresponds mostly to a TCP/IP connection. + +Each of these buckets should represent a unique flow. Because multiple flows may +get hashed to the same bucket, sfq's internal hashing algorithm may be perturbed at configurable +intervals so that the unfairness lasts only for a short while. Perturbation may +however cause some inadvertent packet reordering to occur. After linux-3.3, there is +no packet reordering problem, but possible packet drops if rehashing hits one limit +(number of flows or packets per flow). + +When dequeuing, each hashbucket with data is queried in a round robin fashion. + +Before linux-3.3, the compile time maximum length of the SFQ is 128 packets, which can be spread over +at most 128 buckets of 1024 available. In case of overflow, tail-drop is performed +on the fullest bucket, thus maintaining fairness. + +After linux-3.3, maximum length of SFQ is 65535 packets, and divisor limit is 65536. +In case of overflow, tail-drop is performed on the fullest bucket, unless headdrop was requested. + +.SH PARAMETERS +.TP +divisor +Can be used to set a different hash table size, available from kernel 2.6.39 onwards. +The specified divisor must be a power of two and cannot be larger than 65536. +Default value: 1024. +.TP +limit +Upper limit of the SFQ. Can be used to reduce the default length of 127 packets. +After linux-3.3, it can be raised. +.TP +depth +Limit of packets per flow (after linux-3.3). Defaults to 127 and can be lowered. +.TP +perturb +Interval in seconds for queue algorithm perturbation. Defaults to 0, which means that +no perturbation occurs. Do not set too low for each perturbation may cause some packet +reordering or losses. Advised value: 60. +This value has no effect when external flow classification is used. +It's better to increase the divisor value to lower the risk of hash collisions. +.TP +quantum +Amount of bytes a flow is allowed to dequeue during a round of the round robin process.
+Defaults to the MTU of the interface which is also the advised value and the minimum value. +.TP +flows +After linux-3.3, it is possible to change the default limit of flows. +Default value is 127. +.TP +headdrop +Default SFQ behavior is to perform tail-drop of packets from a flow. +You can ask for headdrop instead, as this is known to provide better feedback for TCP flows. +.TP +redflowlimit +Configure the optional RED module on top of each SFQ flow. +Random Early Detection principle is to perform packet marks or drops in a probabilistic way. +(man tc-red for details about RED) +.nf +redflowlimit configures the hard limit on the real (not average) queue size per SFQ flow in bytes. +.fi +.TP +min +Average queue size at which marking becomes a possibility. Defaults to +.B max +/3 +.TP +max +At this average queue size, the marking probability is maximal. Defaults to +.B redflowlimit +/4 +.TP +probability +Maximum probability for marking, specified as a floating point number from 0.0 to 1.0. Default value is 0.02. +.TP +avpkt +Specified in bytes. Used with burst to determine the time constant for average queue size calculations. Default value is 1000. +.TP +burst +Used for determining how fast the average queue size is influenced by the real queue size. +.nf +Default value is : +.B (2 * min + max) / (3 * avpkt) +.fi +.TP +ecn +RED can either 'mark' or 'drop'. Explicit Congestion +Notification allows RED to notify remote hosts that their rate exceeds the +amount of bandwidth available. Non-ECN capable hosts can only be notified by +dropping a packet. If this parameter is specified, packets which indicate +that their hosts honor ECN will only be marked and not dropped, unless the +queue size hits +.B depth +packets. +.TP +harddrop +If the average flow queue size is above +.B max +bytes, this parameter forces a drop instead of ecn marking.
+.SH EXAMPLE & USAGE + +To attach to device ppp0: +.P +# tc qdisc add dev ppp0 root sfq +.P +Please note that SFQ, like all non-shaping (work-conserving) qdiscs, is only useful +if it owns the queue. +This is the case when the link speed equals the actually available bandwidth. This holds +for regular phone modems, ISDN connections and direct non-switched ethernet links. +.P +Most often, cable modems and DSL devices do not fall into this category. The same holds +for when connected to a switch and trying to send data to a congested segment also +connected to the switch. +.P +In this case, the effective queue does not reside within Linux and is therefore not +available for scheduling. +.P +Embed SFQ in a classful qdisc to make sure it owns the queue. + +It is possible to use external classifiers with sfq, for example to hash traffic based only +on source/destination ip addresses: +.P +# tc filter add ... flow hash keys src,dst perturb 30 divisor 1024 +.P +Note that the given divisor should match the one used by sfq. If you have +changed the sfq default of 1024, use the same value for the flow hash filter, too. + +.P +Example of sfq with optional RED mode : +.P +# tc qdisc add dev eth0 parent 1:1 handle 10: sfq limit 3000 flows 512 divisor 16384 + redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn headdrop + +.SH SOURCE +.TP +o +Paul E. McKenney "Stochastic Fairness Queuing", +IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + +.TP +o +Paul E. McKenney "Stochastic Fairness Queuing", +"Interworking: Research and Experience", v.2, 1991, p.113-131. + +.TP +o +See also: +M. Shreedhar and George Varghese "Efficient Fair +Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + +.SH SEE ALSO +.BR tc (8), +.BR tc-red (8) + +.SH AUTHORS +Alexey N. Kuznetsov, <kuznet@ms2.inr.ac.ru>, +Eric Dumazet <eric.dumazet@gmail.com>. 
+.P +This manpage maintained by bert hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-simple.8 b/man/man8/tc-simple.8 new file mode 100644 index 0000000..f565755 --- /dev/null +++ b/man/man8/tc-simple.8 @@ -0,0 +1,98 @@ +.TH "Simple action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +simple - basic example action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action simple" +[ +.BI sdata " STRING" +] [ +.BI index " INDEX" +] [ +.I CONTROL +] + +.ti -8 +.IR CONTROL " := {" +.BR reclassify " | " pipe " | " drop " | " continue " | " ok " }" + +.SH DESCRIPTION +This is a pedagogical example rather than an actually useful action. Upon every access, it prints the given +.I STRING +which may be of arbitrary length. +.SH OPTIONS +.TP +.BI sdata " STRING" +The actual string to print. +.TP +.BI index " INDEX" +Optional action index value. +.TP +.I CONTROL +Indicate how +.B tc +should proceed after executing the action. For a description of the possible +.I CONTROL +values, see +.BR tc-actions (8). +.SH EXAMPLES +The following example makes the kernel yell "Incoming ICMP!" every time it sees +an incoming ICMP on eth0. Steps are: +.IP 1) 4 +Add an ingress qdisc point to eth0 +.IP 2) 4 +Start a chain on ingress of eth0 that first matches ICMP then invokes the +simple action to shout. 
+.IP 3) 4 +display stats and show that no packet has been seen by the action +.IP 4) 4 +Send one ping packet to google (expect to receive a response back) +.IP 5) 4 +grep the logs to see the logged message +.IP 6) 4 +display stats again and observe increment by 1 + +.EX + hadi@noma1:$ tc qdisc add dev eth0 ingress + hadi@noma1:$tc filter add dev eth0 parent ffff: protocol ip prio 5 \\ + u32 match ip protocol 1 0xff flowid 1:1 action simple sdata "Incoming ICMP" + + hadi@noma1:$ sudo tc -s filter ls dev eth0 parent ffff: + filter protocol ip pref 5 u32 + filter protocol ip pref 5 u32 fh 800: ht divisor 1 + filter protocol ip pref 5 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 + match 00010000/00ff0000 at 8 + action order 1: Simple <Incoming ICMP> + index 4 ref 1 bind 1 installed 29 sec used 29 sec + Action statistics: + Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 + + + hadi@noma1$ ping -c 1 www.google.ca + PING www.google.ca (74.125.225.120) 56(84) bytes of data. 
+ 64 bytes from ord08s08-in-f24.1e100.net (74.125.225.120): icmp_req=1 ttl=53 time=31.3 ms + + --- www.google.ca ping statistics --- + 1 packets transmitted, 1 received, 0% packet loss, time 0ms + rtt min/avg/max/mdev = 31.316/31.316/31.316/0.000 ms + + hadi@noma1$ dmesg | grep simple + [135354.473951] simple: Incoming ICMP_1 + + hadi@noma1$ sudo tc/tc -s filter ls dev eth0 parent ffff: + filter protocol ip pref 5 u32 + filter protocol ip pref 5 u32 fh 800: ht divisor 1 + filter protocol ip pref 5 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 + match 00010000/00ff0000 at 8 + action order 1: Simple <Incoming ICMP> + index 4 ref 1 bind 1 installed 206 sec used 67 sec + Action statistics: + Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 +.EE +.SH SEE ALSO +.BR tc (8) +.BR tc-actions (8) diff --git a/man/man8/tc-skbedit.8 b/man/man8/tc-skbedit.8 new file mode 100644 index 0000000..b2f8e75 --- /dev/null +++ b/man/man8/tc-skbedit.8 @@ -0,0 +1,74 @@ +.TH "SKB editing action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +skbedit - SKB editing action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action skbedit " [ " queue_mapping +.IR QUEUE_MAPPING " ] [" +.B priority +.IR PRIORITY " ] [" +.BI mark " MARK\fR[\fB/\fIMASK] ] [" +.B ptype +.IR PTYPE " ] [" +.BR inheritdsfield " ]" +.SH DESCRIPTION +The +.B skbedit +action allows one to change a packet's associated meta data. It complements the +.B pedit +action, which in turn allows one to change parts of the packet data itself. + +The most unique feature of +.B skbedit +is its ability to decide over which queue of an interface with multiple +transmit queues the packet is to be sent out. The number of available transmit +queues is reflected by sysfs entries within +.I /sys/class/net/<interface>/queues +with name +.I tx-N +(where +.I N +is the actual queue number). +.SH OPTIONS +.TP +.BI queue_mapping " QUEUE_MAPPING" +Override the packet's transmit queue. 
Useful when applied to packets transmitted +over MQ-capable network interfaces. +.I QUEUE_MAPPING +is an unsigned 16bit value in decimal format. +.TP +.BI priority " PRIORITY" +Override the packet classification decision. +.I PRIORITY +is either +.BR root ", " none +or a hexadecimal major class ID optionally followed by a colon +.RB ( : ) +and a hexadecimal minor class ID. +.TP +.BI mark " MARK\fR[\fB/\fIMASK]" +Change the packet's firewall mark value. +.I MARK +is an unsigned 32bit value in automatically detected format (i.e., prefix with +.RB ' 0x ' +for hexadecimal interpretation, etc.). +.I MASK +defines the 32-bit mask selecting bits of mark value. Default is 0xffffffff. +.TP +.BI ptype " PTYPE" +Override the packet's type. Useful for setting packet type to host when +needing to allow ingressing packets with the wrong MAC address but +correct IP address. +.I PTYPE +is one of: host, otherhost, broadcast, multicast +.TP +.BI inheritdsfield +Override the packet classification decision, and any value specified with +.BR priority ", " +using the information stored in the Differentiated Services Field of the +IPv6/IPv4 header (RFC2474). +.SH SEE ALSO +.BR tc (8), +.BR tc-pedit (8) diff --git a/man/man8/tc-skbmod.8 b/man/man8/tc-skbmod.8 new file mode 100644 index 0000000..646a7e6 --- /dev/null +++ b/man/man8/tc-skbmod.8 @@ -0,0 +1,160 @@ +.TH "skbmod action in tc" 8 "21 Sep 2016" "iproute2" "Linux" + +.SH NAME +skbmod - user-friendly packet editor action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... 
" "action skbmod " "{ " "set " +.IR SETTABLE " | " +.BI swap " SWAPPABLE" +.RB " | " ecn +.RI "} [ " CONTROL " ] [ " +.BI index " INDEX " +] + +.ti -8 +.IR SETTABLE " := " +.RB " [ " dmac +.IR DMAC " ] " +.RB " [ " smac +.IR SMAC " ] " +.RB " [ " etype +.IR ETYPE " ] " + +.ti -8 +.IR SWAPPABLE " := " +.B mac +.ti -8 + +.IR CONTROL " := {" +.BR reclassify " | " pipe " | " drop " | " shot " | " continue " | " pass " }" +.SH DESCRIPTION +The +.B skbmod +action is intended as a usability upgrade to the existing +.B pedit +action. Instead of having to manually edit 8-, 16-, or 32-bit chunks of an +ethernet header, +.B skbmod +allows complete substitution of supported elements. +Action must be one of +.BR set ", " swap " and " ecn "." +.BR set " and " swap +only affect Ethernet packets, while +.B ecn +only affects IP packets. +.SH OPTIONS +.TP +.BI dmac " DMAC" +Change the destination mac to the specified address. +.TP +.BI smac " SMAC" +Change the source mac to the specified address. +.TP +.BI etype " ETYPE" +Change the ethertype to the specified value. +.TP +.BI mac +Used to swap mac addresses. +.TP +.B ecn +Used to mark ECN Capable Transport (ECT) IP packets as Congestion Encountered (CE). +Does not affect Non ECN-Capable Transport (Non-ECT) packets. +.TP +.I CONTROL +The following keywords allow one to control how the tree of qdisc, classes, +filters and actions is further traversed after this action. +.RS +.TP +.B reclassify +Restart with the first filter in the current list. +.TP +.B pipe +Continue with the next action attached to the same filter. +.TP +.B drop +.TQ +.B shot +Drop the packet. +.TP +.B continue +Continue classification with the next filter in line. +.TP +.B pass +Finish classification process and return to calling qdisc for further packet +processing. This is the default. 
+.RE +.SH EXAMPLES +To start, observe the following filter with a pedit action: + +.RS +.EX +tc filter add dev eth1 parent 1: protocol ip prio 10 \\ + u32 match ip protocol 1 0xff flowid 1:2 \\ + action pedit munge offset -14 u8 set 0x02 \\ + munge offset -13 u8 set 0x15 \\ + munge offset -12 u8 set 0x15 \\ + munge offset -11 u8 set 0x15 \\ + munge offset -10 u16 set 0x1515 \\ + pipe +.EE +.RE + +Using the skbmod action, this command can be simplified to: + +.RS +.EX +tc filter add dev eth1 parent 1: protocol ip prio 10 \\ + u32 match ip protocol 1 0xff flowid 1:2 \\ + action skbmod set dmac 02:15:15:15:15:15 \\ + pipe +.EE +.RE + +Complexity will increase if source mac and ethertype are also being edited +as part of the action. If all three fields are to be changed with skbmod: + +.RS +.EX +tc filter add dev eth5 parent 1: protocol ip prio 10 \\ + u32 match ip protocol 1 0xff flowid 1:2 \\ + action skbmod \\ + set etype 0xBEEF \\ + set dmac 02:12:13:14:15:16 \\ + set smac 02:22:23:24:25:26 +.EE +.RE + +To swap the destination and source mac addresses in the Ethernet header: + +.RS +.EX +tc filter add dev eth3 parent 1: protocol ip prio 10 \\ + u32 match ip protocol 1 0xff flowid 1:2 \\ + action skbmod \\ + swap mac +.EE +.RE + +Finally, to mark the CE codepoint in the IP header for ECN Capable Transport (ECT) packets: + +.RS +.EX +tc filter add dev eth0 parent 1: protocol ip prio 10 \\ + u32 match ip protocol 1 0xff flowid 1:2 \\ + action skbmod \\ + ecn +.EE +.RE + +Only one of +.BR set ", " swap " and " ecn +shall be used in a single command. +Trying to use more than one of them in a single command is considered undefined behavior; pipe +multiple commands together instead. 
+ +.SH SEE ALSO +.BR tc (8), +.BR tc-u32 (8), +.BR tc-pedit (8) diff --git a/man/man8/tc-skbprio.8 b/man/man8/tc-skbprio.8 new file mode 100644 index 0000000..a0a316b --- /dev/null +++ b/man/man8/tc-skbprio.8 @@ -0,0 +1,70 @@ +.TH SKBPRIO 8 "13 August 2018" "iproute2" "Linux" +.SH NAME +skbprio \- SKB Priority Queue + +.SH SYNOPSIS +.B tc qdisc ... add skbprio +.B [ limit +packets +.B ] + +.SH DESCRIPTION +SKB Priority Queue is a queueing discipline intended to prioritize +the most important packets during a denial-of-service ( +.B DoS +) attack. The priority of a packet is given by +.B skb->priority +, where a higher value places the packet closer to the exit of the queue. When +the queue is full, the lowest priority packet in the queue is dropped to make +room for the packet to be added if it has higher priority. If the packet to be +added has lower priority than all packets in the queue, it is dropped. + +Without SKB priority queue, queue length limits must be imposed +on individual sub-queues, and there is no straightforward way to enforce +a global queue length limit across all priorities. SKBprio queue enforces +a global queue length limit while not restricting the lengths of +individual sub-queues. + +SKB Priority Queue is agnostic to how +.B skb->priority +is assigned. A typical use case is to copy +the 6-bit DS field of IPv4 and IPv6 packets using +.BR tc-skbedit (8). +If +.B skb->priority +is greater than or equal to 64, the priority is assumed to be 63. +Priorities less than 64 are taken at face value. + +SKB Priority Queue enables routers to locally decide which +packets to drop under a DoS attack. +Priorities should be assigned to packets such that the higher the priority, +the more expected behavior a source shows. +So sources have an incentive to play by the rules. + +.SH ALGORITHM + +Skbprio maintains 64 lists (priorities go from 0 to 63). +When a packet is enqueued, it gets inserted at the +.B tail +of its priority list.
When a packet needs to be sent out to the network, it is +taken from the head of the highest priority list. When the queue is full, +the packet at the tail of the lowest priority list is dropped to serve the +ingress packet - if it is of higher priority, otherwise the ingress packet is +dropped. This algorithm allocates as much bandwidth as possible to high +priority packets, while only servicing low priority packets when +there is enough bandwidth. + +.SH PARAMETERS +.TP +limit +Maximum queue size specified in packets. It defaults to 64. +The range for this parameter is [0, UINT32_MAX]. + +.SH SEE ALSO +.BR tc-prio (8), +.BR tc-skbedit (8) + +.SH AUTHORS +Nishanth Devarajan <devarajn@uci.edu>, Michel Machado <michel@digirati.com.br> + +This manpage maintained by Bert Hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-stab.8 b/man/man8/tc-stab.8 new file mode 100644 index 0000000..03a0659 --- /dev/null +++ b/man/man8/tc-stab.8 @@ -0,0 +1,163 @@ +.TH STAB 8 "31 October 2011" iproute2 Linux +. +.SH NAME +tc\-stab \- Generic size table manipulations +. +.SH SYNOPSIS +.nf +tc qdisc add ... stab +.RS 4 +[ \fBmtu\fR BYTES ] [ \fBtsize\fR SLOTS ] +[ \fBmpu\fR BYTES ] [ \fBoverhead\fR BYTES ] +[ \fBlinklayer\fR { adsl | atm | ethernet } ] ... +.RE +.fi + +.SH OPTIONS +For the description of BYTES \- please refer to the \fBUNITS\fR +section of \fBtc\fR(8). + +.IP \fBmtu\fR 4 +.br +maximum packet size we create size table for, assumed 2048 if not specified explicitly +.IP \fBtsize\fR +.br +required table size, assumed 512 if not specified explicitly +.IP \fBmpu\fR +.br +minimum packet size used in computations +.IP \fBoverhead\fR +.br +per\-packet size overhead (can be negative) used in computations +.IP \fBlinklayer\fR +.br +required linklayer specification. +.PP +. +.SH DESCRIPTION +. +Size tables allow manipulation of packet sizes, as seen by the whole scheduler +framework (of course, the actual packet size remains the same). 
Adjusted packet +size is calculated only once \- when a qdisc enqueues the packet. Initial root +enqueue initializes it to the real packet's size. + +Each qdisc can use a different size table, but the adjusted size is stored in +an area shared by whole qdisc hierarchy attached to the interface. The effect is +that if you have such a setup, the last qdisc with a stab in a chain "wins". For +example, consider HFSC with simple pfifo attached to one of its leaf classes. +If that pfifo qdisc has stab defined, it will override lengths calculated +during HFSC's enqueue; and in turn, whenever HFSC tries to dequeue a packet, it +will use a potentially invalid size in its calculations. Normal setups will +usually include stab defined only on root qdisc, but further overriding gives +extra flexibility for less usual setups. + +The initial size table is calculated by \fBtc\fR tool using \fBmtu\fR and +\fBtsize\fR parameters. The algorithm sets each slot's size to the smallest +power of 2 value, so the whole \fBmtu\fR is covered by the size table. Neither +\fBtsize\fR, nor \fBmtu\fR have to be power of 2 value, so the size +table will usually support more than is required by \fBmtu\fR. + +For example, with \fBmtu\fR\~=\~1500 and \fBtsize\fR\~=\~128, a table with 128 +slots will be created, where slot 0 will correspond to sizes 0\-16, slot 1 to +17\~\-\~32, \&..., slot 127 to 2033\~\-\~2048. Sizes assigned to each slot +depend on \fBlinklayer\fR parameter. + +Stab calculation is also safe for an unusual case, when a size assigned to a +slot would be larger than 2^16\-1 (you will lose the accuracy though). + +During the kernel part of packet size adjustment, \fBoverhead\fR will be added +to original size, and then slot will be calculated. If the size would cause +overflow, more than 1 slot will be used to get the final size. This of course +will affect accuracy, but it's only a guard against unusual situations. 
+ +Currently there are two methods of creating values stored in the size table \- +ethernet and atm (adsl): + +.IP ethernet 4 +.br +This is basically 1\-1 mapping, so following our example from above +(disregarding \fBmpu\fR for a moment) slot 0 would have 8, slot 1 would have 16 +and so on, up to slot 127 with 2048. Note, that \fBmpu\fR\~>\~0 must be +specified, and slots that would get less than specified by \fBmpu\fR will get +\fBmpu\fR instead. If you don't specify \fBmpu\fR, the size table will not be +created at all (it wouldn't make any difference), although any \fBoverhead\fR +value will be respected during calculations. +.IP "atm, adsl" +.br +ATM linklayer consists of 53 byte cells, where each of them provides 48 bytes +for payload. Also all the cells must be fully utilized, thus the last one is +padded if/as necessary. + +When the size table is calculated, adjusted size that fits properly into lowest +amount of cells is assigned to a slot. For example, a 100 byte long packet +requires three 48\-byte payloads, so the final size would require 3 ATM cells +\- 159 bytes. + +For ATM size tables, 16\~bytes sized slots are perfectly enough. The default +values of \fBmtu\fR and \fBtsize\fR create 4\~bytes sized slots. +.PP +. +.SH "TYPICAL OVERHEADS" +The following values are typical for different adsl scenarios (based on +\fB[1]\fR and \fB[2]\fR): + +.nf +LLC based: +.RS 4 +PPPoA \- 14 (PPP \- 2, ATM \- 12) +PPPoE \- 40+ (PPPoE \- 8, ATM \- 18, ethernet 14, possibly FCS \- 4+padding) +Bridged \- 32 (ATM \- 18, ethernet 14, possibly FCS \- 4+padding) +IPoA \- 16 (ATM \- 16) +.RE + +VC Mux based: +.RS 4 +PPPoA \- 10 (PPP \- 2, ATM \- 8) +PPPoE \- 32+ (PPPoE \- 8, ATM \- 10, ethernet 14, possibly FCS \- 4+padding) +Bridged \- 24+ (ATM \- 10, ethernet 14, possibly FCS \- 4+padding) +IPoA \- 8 (ATM \- 8) +.RE +.fi +There are a few important things regarding the above overheads: +. 
+.IP \(bu 4 +IPoA in LLC case requires SNAP, instead of LLC\-NLPID (see RFC2684) \- this is +the reason why it actually takes more space than PPPoA. +.IP \(bu +In rare cases, FCS might be preserved on protocols that include Ethernet frames +(Bridged and PPPoE). In such a situation, any Ethernet specific padding +guaranteeing 64 bytes long frame size has to be included as well (see RFC2684). +In other words, it also guarantees that any packet you send will take +a minimum of 2 ATM cells. You should set \fBmpu\fR accordingly for that. +.IP \(bu +When the size table is consulted, and you're shaping traffic for the sake of +another modem/router, an Ethernet header (without padding) will already be added +to initial packet's length. You should compensate for that by subtracting 14 +from the above overheads in this case. If you're shaping directly on the router +(for example, with speedtouch usb modem) using ppp daemon, you're using raw ip +interface without underlying layer2, so nothing will be added. + +For more thorough explanations, please see \fB[1]\fR and \fB[2]\fR. +. +.SH "ETHERNET CARDS CONSIDERATIONS" +. +It's often forgotten that modern network cards (even cheap ones on desktop +motherboards) and/or their drivers often support different offloading +mechanisms. In the context of traffic shaping, 'tso' and 'gso' might cause +undesirable effects, due to massive TCP segments being considered during +traffic shaping (including stab calculations). For slow uplink interfaces, +it's good to use \fBethtool\fR to turn off offloading features. +. +.SH "SEE ALSO" +. +\fBtc\fR(8), \fBtc\-hfsc\fR(7), \fBtc\-hfsc\fR(8), +.br +\fB[1]\fR http://ace\-host.stuart.id.au/russell/files/tc/tc\-atm/ +.br +\fB[2]\fR http://www.faqs.org/rfcs/rfc2684.html + +Please direct bug reports and patches to: <netdev@vger.kernel.org> +. +.SH "AUTHOR" +.
+Manpage created by Michal Soltys (soltys@ziu.info) diff --git a/man/man8/tc-taprio.8 b/man/man8/tc-taprio.8 new file mode 100644 index 0000000..bf489b0 --- /dev/null +++ b/man/man8/tc-taprio.8 @@ -0,0 +1,256 @@ +.TH TAPRIO 8 "25 Sept 2018" "iproute2" "Linux" +.SH NAME +TAPRIO \- Time Aware Priority Shaper +.SH SYNOPSIS +.B tc qdisc ... dev +dev +.B parent +classid +.B [ handle +major: +.B ] taprio num_tc +tcs +.ti +8 +.B map +P0 P1 P2 ... +.B queues +count1@offset1 count2@offset2 ... +.ti +8 +.B base-time +base-time +.B clockid +clockid +.ti +8 +.B sched-entry +<command 1> <gate mask 1> <interval 1> +.ti +8 +.B sched-entry +<command 2> <gate mask 2> <interval 2> +.ti +8 +.B sched-entry +<command 3> <gate mask 3> <interval 3> +.ti +8 +.B sched-entry +<command N> <gate mask N> <interval N> +.ti +8 +[ +.B max-sdu +<queueMaxSDU[TC 0]> <queueMaxSDU[TC 1]> <queueMaxSDU[TC N]> ] +.ti +8 +[ +.B fp +<adminStatus[TC 0]> <adminStatus[TC 1]> <adminStatus[TC N]> ] + +.SH DESCRIPTION +The TAPRIO qdisc implements a simplified version of the scheduling +state machine defined by IEEE 802.1Q-2018 Section 8.6.9, which allows +configuration of a sequence of gate states, where each gate state +allows outgoing traffic for a subset (potentially empty) of traffic +classes. + +How traffic is mapped to different hardware queues is similar to +.BR mqprio(8) +and so the +.B map +and +.B queues +parameters have the same meaning. + +The other parameters specify the schedule, and at what point in time +it should start (it can behave as if the schedule started in the past). + +.SH PARAMETERS +.TP +num_tc +.br +Number of traffic classes to use. Up to 16 classes are supported. + +.TP +map +.br +The priority to traffic class map. Maps priorities 0..15 to a specified +traffic class. See +.BR mqprio(8) +for more details. + +.TP +queues +.br +Provide count and offset of queue range for each traffic class. In the +format, +.B count@offset.
+Queue ranges for each traffic class cannot overlap and must be a +contiguous range of queues. + +.TP +base-time +.br +Specifies the instant in nanoseconds, using the reference of +.B clockid, +defining the time when the schedule starts. If 'base-time' is a time +in the past, the schedule will start at + +base-time + (N * cycle-time) + +where N is the smallest integer so the resulting time is greater than +"now", and "cycle-time" is the sum of all the intervals of the entries +in the schedule. + +.TP +clockid +.br +Specifies the clock to be used by qdisc's internal timer for measuring +time and scheduling events. This argument must be omitted when using the +full-offload feature (flags 0x2), since in that case, the clockid is +implicitly /dev/ptpN (where N is given by +.B ethtool -T eth0 | grep 'PTP Hardware Clock' +), and therefore not necessarily synchronized with the system's CLOCK_TAI. + +.TP +sched-entry +.br +There may be multiple +.B sched-entry +parameters in a single schedule. Each one has the + +sched-entry <command> <gate mask> <interval> + +format. The only supported <command> is "S", which +means "SetGateStates", following the IEEE 802.1Q-2018 definition +(Table 8-7). <gate mask> is a bitmask where each bit is associated +with a traffic class, so bit 0 (the least significant bit) being "on" +means that traffic class 0 is "active" for that schedule entry. +<interval> is a time duration, in nanoseconds, that specifies for how +long that state defined by <command> and <gate mask> should be held +before moving to the next entry. + +.TP +flags +.br +This is a bit mask which specifies different modes for taprio. +.RS +.TP +.I 0x1 +Enables the txtime-assist feature. In this mode, taprio will set the transmit +timestamp depending on the interval in which the packet needs to be +transmitted. It will then utilize the +.BR etf(8) +qdisc to sort and transmit the packets at the right time. The second example +can be used as a reference to configure this mode.
+.TP +.I 0x2 +Enables the full-offload feature. In this mode, taprio will pass the gate +control list to the NIC which will execute it cyclically in hardware. +When using full-offload, the +.B clockid +argument must be omitted. + +The txtime-assist and full-offload features are mutually exclusive, i.e. +setting flags to 0x3 is invalid. +.RE + +.TP +txtime-delay +.br +This parameter is specific to the txtime offload mode. It specifies the maximum +time a packet might take to reach the network card from the taprio qdisc. The +value should always be greater than the delta specified in the +.BR etf(8) +qdisc. + +.TP +max-sdu +.br +Specifies an array containing at most 16 elements, one per traffic class, which +corresponds to the queueMaxSDU table from IEEE 802.1Q-2018. Each array element +represents the maximum L2 payload size that can egress that traffic class. +Elements that are not filled in default to 0. The value 0 means that the +traffic class can send packets up to the port's maximum MTU in size. + +.TP +fp +.br +Selects whether traffic classes are express or preemptible. See +.BR tc-mqprio(8) +for details. + +.SH EXAMPLES + +The following example shows a traffic schedule with three traffic +classes ("num_tc 3"), which are kept as separate traffic classes; +we are going to call these TC 0, TC 1 and TC 2. We could read the +"map" parameter below as: traffic with priority 3 is classified as TC +0, priority 2 is classified as TC 1 and the rest is classified as TC +2. + +The schedule will start at instant 1528743495910289987 using the +reference CLOCK_TAI. The schedule is composed of three entries each of +300us duration.
+ +.EX +# tc qdisc replace dev eth0 parent root handle 100 stab overhead 24 taprio \\ + num_tc 3 \\ + map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\ + queues 1@0 1@1 2@2 \\ + base-time 1528743495910289987 \\ + sched-entry S 01 300000 \\ + sched-entry S 02 300000 \\ + sched-entry S 04 300000 \\ + clockid CLOCK_TAI +.EE + +Following is an example to enable the txtime offload mode in taprio. See +.BR etf(8) +for more information about configuring the ETF qdisc. + +.EX +# tc qdisc replace dev eth0 parent root handle 100 stab overhead 24 taprio \\ + num_tc 3 \\ + map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\ + queues 1@0 1@0 1@0 \\ + base-time 1528743495910289987 \\ + sched-entry S 01 300000 \\ + sched-entry S 02 300000 \\ + sched-entry S 04 400000 \\ + flags 0x1 \\ + txtime-delay 200000 \\ + clockid CLOCK_TAI + +# tc qdisc replace dev $IFACE parent 100:1 etf skip_skb_check \\ + offload delta 200000 clockid CLOCK_TAI +.EE + +The following is a schedule in full offload mode. The +.B base-time +is 200 ns and the +.B cycle-time +is implicitly calculated as the sum of all +.B sched-entry +durations (i.e. 20 us + 20 us + 60 us = 100 us). Although the base-time is in +the past, the hardware will start executing the schedule at a PTP time equal to +the smallest integer multiple of 100 us, plus 200 ns, that is larger than the +NIC's current PTP time. In addition, the MTU for traffic class 5 is limited to +200 octets, so that the interference this creates upon traffic class 7 during +the time window when their gates are both open is bounded. The interference is +determined by the transmit time of the max SDU, plus the L2 header length, plus +the L1 overhead (determined from the size table specified using +.BR tc-stab(8)). 
+ +.EX +# tc qdisc add dev eth0 parent root stab overhead 24 taprio \\ + num_tc 8 \\ + map 0 1 2 3 4 5 6 7 \\ + queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \\ + max-sdu 0 0 0 0 0 200 0 0 \\ + base-time 200 \\ + sched-entry S 80 20000 \\ + sched-entry S a0 20000 \\ + sched-entry S 5f 60000 \\ + flags 0x2 +.EE + +.SH SEE ALSO +.BR tc-stab(8) + +.SH AUTHORS +Vinicius Costa Gomes <vinicius.gomes@intel.com> diff --git a/man/man8/tc-tbf.8 b/man/man8/tc-tbf.8 new file mode 100644 index 0000000..d721b5d --- /dev/null +++ b/man/man8/tc-tbf.8 @@ -0,0 +1,141 @@ +.TH TC 8 "13 December 2001" "iproute2" "Linux" +.SH NAME +tbf \- Token Bucket Filter +.SH SYNOPSIS +.B tc qdisc ... tbf rate +rate +.B burst +bytes/cell +.B ( latency +ms +.B | limit +bytes +.B ) [ mpu +bytes +.B [ peakrate +rate +.B mtu +bytes/cell +.B ] ] +.P +burst is also known as buffer and maxburst. mtu is also known as minburst. +.SH DESCRIPTION + +The Token Bucket Filter is a classful queueing discipline available for +traffic control with the +.BR tc (8) +command. + +TBF is a pure shaper and never schedules traffic. It is non-work-conserving and may throttle +itself, although packets are available, to ensure that the configured rate is not exceeded. +It is able to shape up to 1mbit/s of normal traffic with ideal minimal burstiness, +sending out data exactly at the configured rates. + +Much higher rates are possible but at the cost of losing the minimal burstiness. In that +case, data is on average dequeued at the configured rate but may be sent much faster at millisecond +timescales. Because of further queues living in network adaptors, this is often not a problem. + +.SH ALGORITHM +As the name implies, traffic is filtered based on the expenditure of +.B tokens. +Tokens roughly correspond to bytes, with the additional constraint +that each packet consumes some tokens, no matter how small it is. This +reflects the fact that even a zero-sized packet occupies the link for +some time. 
+ +On creation, the TBF is stocked with tokens which correspond to the amount of traffic that can be burst +in one go. Tokens arrive at a steady rate, until the bucket is full. + +If no tokens are available, packets are queued, up to a configured limit. The TBF now +calculates the token deficit, and throttles until the first packet in the queue can be sent. + +If it is not acceptable to burst out packets at maximum speed, a peakrate can be configured +to limit the speed at which the bucket empties. This peakrate is implemented as a second TBF +with a very small bucket, so that it doesn't burst. + +To achieve perfection, the second bucket may contain only a single packet, which leads to +the earlier mentioned 1mbit/s limit. + +This limit is caused by the fact that the kernel can only throttle for at minimum 1 'jiffy', which depends +on HZ as 1/HZ. For perfect shaping, only a single packet can get sent per jiffy - for HZ=100, this means 100 +packets of on average 1000 bytes each, which roughly corresponds to 1mbit/s. + +.SH PARAMETERS +See +.BR tc (8) +for how to specify the units of these values. +.TP +limit or latency +Limit is the number of bytes that can be queued waiting for tokens to become +available. You can also specify this the other way around by setting the +latency parameter, which specifies the maximum amount of time a packet can +sit in the TBF. The latter calculation takes into account the size of the +bucket, the rate and possibly the peakrate (if set). These two parameters +are mutually exclusive. +.TP +burst +Also known as buffer or maxburst. +Size of the bucket, in bytes. This is the maximum amount of bytes that tokens can be available for instantaneously. +In general, larger shaping rates require a larger buffer. For 10mbit/s on Intel, you need at least 10kbyte buffer +if you want to reach your configured rate! + +If your buffer is too small, packets may be dropped because more tokens arrive per timer tick than fit in your bucket. 
+The minimum buffer size can be calculated by dividing the rate by HZ. + +Token usage calculations are performed using a table which by default has a resolution of 8 packets. +This resolution can be changed by specifying the +.B cell +size with the burst. For example, to specify a 6000 byte buffer with a 16 +byte cell size, set a burst of 6000/16. You will probably never have to set +this. Must be an integral power of 2. +.TP +mpu +A zero-sized packet does not use zero bandwidth. For ethernet, no packet uses less than 64 bytes. The Minimum Packet Unit +determines the minimal token usage (specified in bytes) for a packet. Defaults to zero. +.TP +rate +The speed knob. See remarks above about limits! See +.BR tc (8) +for units. +.PP +Furthermore, if a peakrate is desired, the following parameters are available: + +.TP +peakrate +Maximum depletion rate of the bucket. The peakrate does not +need to be set, it is only necessary if perfect millisecond timescale +shaping is required. + +.TP +mtu/minburst +Specifies the size of the peakrate bucket. For perfect accuracy, should be set to the MTU of the interface. +If a peakrate is needed, but some burstiness is acceptable, this size can be raised. A 3000 byte minburst +allows around 3mbit/s of peakrate, given 1000 byte packets. + +Like the regular burstsize you can also specify a +.B cell +size. +.SH EXAMPLE & USAGE + +To attach a TBF with a sustained maximum rate of 0.5mbit/s, a peakrate of 1.0mbit/s, +a 5kilobyte buffer, with a pre-bucket queue size limit calculated so the TBF causes +at most 70ms of latency, with perfect peakrate behaviour, issue: +.P +# tc qdisc add dev eth0 handle 10: root tbf rate 0.5mbit \\ + burst 5kb latency 70ms peakrate 1mbit \\ + minburst 1540 +.P +To attach an inner qdisc, for example sfq, issue: +.P +# tc qdisc add dev eth0 parent 10:1 handle 100: sfq +.P +Without inner qdisc TBF queue acts as bfifo. If the inner qdisc is changed +the limit/latency is not effective anymore. 
+.P + +.SH SEE ALSO +.BR tc (8) + +.SH AUTHOR +Alexey N. Kuznetsov, <kuznet@ms2.inr.ac.ru>. This manpage maintained by +bert hubert <ahu@ds9a.nl> diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8 new file mode 100644 index 0000000..b987cd0 --- /dev/null +++ b/man/man8/tc-tunnel_key.8 @@ -0,0 +1,175 @@ +.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" "Linux" + +.SH NAME +tunnel_key - Tunnel metadata manipulation +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action tunnel_key" " { " unset " | " +.IR SET " }" + +.ti -8 +.IR SET " := " +.BR set " " src_ip +.IR ADDRESS +.BR dst_ip +.IR ADDRESS +.BI id " KEY_ID" +.BI dst_port " UDP_PORT" +.BI tos " TOS" +.BI ttl " TTL" +.RB "[ " csum " | " nocsum " ]" + +.SH DESCRIPTION +The +.B tunnel_key +action combined with a shared IP tunnel device, allows one to perform IP tunnel en- +or decapsulation on a packet, reflected by +the operation modes +.IR UNSET " and " SET . +The +.I UNSET +mode is optional - even without using it, the metadata information will be +released automatically when packet processing will be finished. +.IR UNSET +function could be used in cases when traffic is forwarded between two tunnels, +where the metadata from the first tunnel will be used for encapsulation done by +the second tunnel. +.IR SET +mode requires the source and destination ip +.I ADDRESS +and the tunnel key id +.I KEY_ID +which will be used by the ip tunnel shared device to create the tunnel header. The +.B tunnel_key +action is useful only in combination with a +.B mirred redirect +action to a shared IP tunnel device which will use the metadata (for +.I SET +) and unset the metadata created by it (for +.I UNSET +). + +.SH OPTIONS +.TP +.B unset +Unset the tunnel metadata created by the IP tunnel device. This function is +not mandatory and might be used only in some specific use cases (as explained +above). +.TP +.B set +Set tunnel metadata to be used by the IP tunnel device. 
Requires +.B src_ip +and +.B dst_ip +options. +.B id +, +.B dst_port +, +.B geneve_opts +, +.B vxlan_opts +and +.B erspan_opts +are optional. +.RS +.TP +.B id +Tunnel ID (for example VNI in VXLAN tunnel) +.TP +.B src_ip +Outer header source IP address (IPv4 or IPv6) +.TP +.B dst_ip +Outer header destination IP address (IPv4 or IPv6) +.TP +.B dst_port +Outer header destination UDP port +.TP +.B geneve_opts +Geneve variable length options. +.B geneve_opts +is specified in the form CLASS:TYPE:DATA, where CLASS is represented as a +16bit hexadecimal value, TYPE as an 8bit hexadecimal value and DATA as a +variable length hexadecimal value. Additionally multiple options may be +listed using a comma delimiter. +.TP +.B vxlan_opts +Vxlan metadata options. +.B vxlan_opts +is specified in the form GBP, as a 32bit number. Multiple options are not +supported. +.TP +.B erspan_opts +Erspan metadata options. +.B erspan_opts +is specified in the form VERSION:INDEX:DIR:HWID, where VERSION is represented +as an 8bit number, INDEX as a 32bit number, DIR and HWID as 8bit numbers. +Multiple options are not supported. Note INDEX is used when VERSION is 1, +and DIR and HWID are used when VERSION is 2. +.TP +.B tos +Outer header TOS +.TP +.B ttl +Outer header TTL +.TP +.RB [ no ] csum +Controls outer UDP checksum. When set to +.B csum +(which is default), the outer UDP checksum is calculated and included in the +packets. When set to +.BR nocsum , +outer UDP checksum is zero. Note that when using zero UDP checksums with +IPv6, the other tunnel endpoint must be configured to accept such packets. +In Linux, this would be the +.B udp6zerocsumrx +option for the VXLAN tunnel interface. +.IP +If using +.B nocsum +with IPv6, be sure you know what you are doing. Zero UDP checksums provide +weaker protection against corrupted packets. See RFC6935 for details. +.TP +.B nofrag +Disallow IP fragmentation.
+.RE +.SH EXAMPLES +The following example encapsulates incoming ICMP packets on eth0 into a vxlan +tunnel, by setting metadata to VNI 11, source IP 11.11.0.1 and destination IP +11.11.0.2, and by redirecting the packet with the metadata to device vxlan0, +which will do the actual encapsulation using the metadata: + +.RS +.EX +#tc qdisc add dev eth0 handle ffff: ingress +#tc filter add dev eth0 protocol ip parent ffff: \\ + flower \\ + ip_proto icmp \\ + action tunnel_key set \\ + src_ip 11.11.0.1 \\ + dst_ip 11.11.0.2 \\ + id 11 \\ + action mirred egress redirect dev vxlan0 +.EE +.RE + +Here is an example of the +.B unset +function: Incoming VXLAN traffic with outer IPs and VNI 11 is decapsulated by +vxlan0 and metadata is unset before redirecting to tunl1 device: + +.RS +.EX +#tc qdisc add dev eth0 handle ffff: ingress +#tc filter add dev vxlan0 protocol ip parent ffff: \\ + flower \\ + enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 \\ + action tunnel_key unset \\ + action mirred egress redirect dev tunl1 +.EE +.RE + +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-u32.8 b/man/man8/tc-u32.8 new file mode 100644 index 0000000..dfbf73e --- /dev/null +++ b/man/man8/tc-u32.8 @@ -0,0 +1,674 @@ +.TH "Universal 32bit classifier in tc" 8 "25 Sep 2015" "iproute2" "Linux" + +.SH NAME +u32 \- universal 32bit traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ...
[ " handle +.IR HANDLE " ] " +.B u32 +.IR OPTION_LIST " [ " +.B offset +.IR OFFSET " ] [ " +.B hashkey +.IR HASHKEY " ] [ " +.B classid +.IR CLASSID " ] [ " +.B divisor +.IR uint_value " ] [ " +.B order +.IR u32_value " ] [ " +.B ht +.IR HANDLE " ] [ " +.B sample +.IR SELECTOR " [ " +.B divisor +.IR uint_value " ] ] [ " +.B link +.IR HANDLE " ] [ " +.B indev +.IR ifname " ] [ " +.BR skip_hw " | " +.BR skip_sw " ] [ " +.BR help " ]" + +.ti -8 +.IR HANDLE " := { " +\fIu12_hex_htid\fB:\fR[\fIu8_hex_hash\fB:\fR[\fIu12_hex_nodeid\fR] | \fB0x\fIu32_hex_value\fR } + +.ti -8 +.IR OPTION_LIST " := [ " OPTION_LIST " ] " OPTION + +.ti -8 +.IR HASHKEY " := [ " +.B mask +.IR u32_hex_value " ] [ " +.B at +.IR 4*int_value " ]" + +.ti -8 +.IR CLASSID " := { " +.BR root " | " +.BR none " | " +[\fIu16_major\fR]\fB:\fIu16_minor\fR | \fIu32_hex_value\fR } + +.ti -8 +.IR OFFSET " := [ " +.B plus +.IR int_value " ] [ " +.B at +.IR 2*int_value " ] [ " +.B mask +.IR u16_hex_value " ] [ " +.B shift +.IR int_value " ] [ " +.BR eat " ]" + +.ti -8 +.IR OPTION " := { " +.B match +.IR SELECTOR " | " +.B action +.IR ACTION " } " + +.ti -8 +.IR SELECTOR " := { " +.B u32 +.IR VAL_MASK_32 " | " +.B u16 +.IR VAL_MASK_16 " | " +.B u8 +.IR VAL_MASK_8 " | " +.B ip +.IR IP " | " +.B ip6 +.IR IP6 " | { " +.BR tcp " | " udp " } " +.IR TCPUDP " | " +.B icmp +.IR ICMP " | " +.B mark +.IR VAL_MASK_32 " | " +.B ether +.IR ETHER " }" + +.ti -8 +.IR IP " := { { " +.BR src " | " dst " } { " default " | " any " | " all " | " +.IR ip_address " [ " +.BR / " { " +.IR prefixlen " | " netmask " } ] } " AT " | { " +.BR dsfield " | " ihl " | " protocol " | " precedence " | " +.BR icmp_type " | " icmp_code " } " +.IR VAL_MASK_8 " | { " +.BR sport " | " dport " } " +.IR VAL_MASK_16 " | " +.BR nofrag " | " firstfrag " | " df " | " mf " }" + +.ti -8 +.IR IP6 " := { { " +.BR src " | " dst " } { " default " | " any " | " all " | " +.IR ip6_address " [/" prefixlen " ] } " AT " | " +.B priority +.IR VAL_MASK_8 " | { " +.BR 
protocol " | " icmp_type " | " icmp_code " } " +.IR VAL_MASK_8 " | " +.B flowlabel +.IR VAL_MASK_32 " | { " +.BR sport " | " dport " } " +.IR VAL_MASK_16 " }" + +.ti -8 +.IR TCPUDP " := { " +.BR src " | " dst " } " +.I VAL_MASK_16 + +.ti -8 +.IR ICMP " := { " +.B type +.IR VAL_MASK_8 " | " +.B code +.IR VAL_MASK_8 " }" + +.ti -8 +.IR ETHER " := { " +.BR src " | " dst " } " +.IR ether_address " " AT + +.ti -8 +.IR VAL_MASK_32 " := " u32_value " " u32_hex_mask " [ " AT " ]" + +.ti -8 +.IR VAL_MASK_16 " := " u16_value " " u16_hex_mask " [ " AT " ]" + +.ti -8 +.IR VAL_MASK_8 " := " u8_value " " u8_hex_mask " [ " AT " ]" + +.ti -8 +.IR AT " := [ " +.BR at " [ " nexthdr+ " ] " +.IR int_value " ]" +.SH DESCRIPTION +The Universal/Ugly 32bit filter allows one to match arbitrary bitfields in the +packet. Due to breaking everything down to values, masks and offsets, It is +equally powerful and hard to use. Luckily many abstracting directives are +present which allow defining rules on a higher level and therefore free the +user from having to fiddle with bits and masks in many cases. + +There are two general modes of invocation: The first mode creates a new filter +to delegate packets to different destinations. Apart from the obvious ones, +namely classifying the packet by specifying a +.I CLASSID +or calling an +.BR action , +one may +.B link +one filter to another one (or even a list of them), effectively organizing +filters into a tree-like hierarchy. + +Typically filter delegation is done by means of a hash table, which leads to the +second mode of invocation: it merely serves to set up these hash tables. Filters +can select a hash table and provide a key selector from which a hash is to be +computed and used as key to lookup the table's bucket which contains filters for +further processing. This is useful if a high number of filters is in use, as the +overhead of performing the hash operation and table lookup becomes negligible in +that case. 
Using hash tables with +.B u32 +basically involves the following pattern: +.IP (1) 4 +Creating a new hash table, specifying its size using the +.B divisor +parameter and ideally a handle by which the table can be identified. If the +latter is not given, the kernel chooses one on its own, which has to be +guessed later. +.IP (2) 4 +Creating filters which link to the created table in +.I (1) +using the +.B link +parameter and defining the packet data which the kernel will use to calculate +the +.BR hashkey . +.IP (3) 4 +Adding filters to buckets in the hash table from +.IR (1) . +In order to avoid having to know how exactly the kernel creates the hash key, +there is the +.B sample +parameter, which gives sample data to hash and thereby defines the table bucket +the filter should be added to. + +.RE +In fact, even if not explicitly requested +.B u32 +creates a hash table for every +.B priority +a filter is being added with. The table's size is 1 though, so it is in fact +merely a linked list. +.SH VALUES +Options and selectors require values to be specified in a specific format, which +is often non-intuitive. Therefore the terminals in +.I SYNOPSIS +have been given descriptive names to indicate the required format and/or maximum +allowed numeric value: Prefixes +.IR u32 ", " u16 " and " u8 +indicate four, two and single byte unsigned values. E.g. +.I u16 +indicates a two byte-sized value in range between 0 and 65535 (0xFFFF) +inclusive. A prefix of +.I int +indicates a four byte signed value. A middle part of +.I _hex_ +indicates that the value is parsed in hexadecimal format. Otherwise, the +value's base is automatically detected, i.e. values prefixed with +.I 0x +are considered hexadecimal, a leading +.I 0 +indicates octal format and decimal format otherwise. There are some values with +special formatting as well: +.IR ip_address " and " netmask +are in dotted-quad formatting as usual for IPv4 addresses.
An +.I ip6_address +is specified in common, colon-separated hexadecimal format. Finally, +.I prefixlen +is an unsigned, decimal integer value in range from 0 to the address width in +bits (32 for IPv4 and 128 for IPv6). + +Sometimes values need to be divisible by a certain number. In that case a name +of the form +.I N*val +was chosen, indicating that +.I val +must be divisible by +.IR N . +Or the other way around: the resulting value must be a multiple of +.IR N . +.SH OPTIONS +.B U32 +recognizes the following options: +.TP +.BI handle " HANDLE" +The handle is used to reference a filter and therefore must be unique. It +consists of a hash table identifier +.B htid +and optional +.B hash +(which identifies the hash table's bucket) and +.BR nodeid . +All these values are parsed as unsigned, hexadecimal numbers with length 12bits +( +.BR htid " and " nodeid ) +or 8bits ( +.BR hash ). +Alternatively one may specify a single, 32bit long hex number which contains +the three fields' bits in concatenated form. Unlike the fields themselves, it +has to be prefixed by +.BR 0x . +.TP +.BI offset " OFFSET" +Set an offset which defines where matches of subsequent filters are applied to. +Therefore this option is useful only when combined with +.BR link " or a combination of " ht " and " sample . +The offset may be given explicitly by using the +.B plus +keyword, or extracted from the packet data with +.BR at . +It is possible to mangle the latter using +.BR mask " and/or " shift +keywords. By default, this offset is recorded but not implicitly applied. It is +used only to substitute the +.B nexthdr+ +statement. Using the keyword +.B eat +though inverts this behaviour: the offset is applied always, and +.B nexthdr+ +will fall back to zero. +.TP +.BI hashkey " HASHKEY" +Specify what packet data to use to calculate a hash key for bucket lookup. The +kernel adjusts the value according to the hash table's size. For this to work, +the option +.B link +must be given.
+.TP +.BI classid " CLASSID" +Classify matching packets into the given +.IR CLASSID , +which consists of either 16bit +.BR major " and " minor +numbers or a single 32bit value combining both. +.TP +.BI divisor " u32_value" +Specify a modulo value. Used when creating hash tables to define their size or +for declaring a +.B sample +to calculate hash table keys from. Must be a power of two with exponent not +exceeding eight. +.TP +.BI order " u32_value" +A value to order filters by, ascending. Conflicts with +.B handle +which serves the same purpose. +.TP +.BI sample " SELECTOR" +Used together with +.B ht +to specify which bucket to add this filter to. This allows one to avoid having +to know how exactly the kernel calculates hashes. The additional +.B divisor +defaults to 256, so must be given for hash tables of different size. +.TP +.BI link " HANDLE" +Delegate matching packets to filters in a hash table. +.I HANDLE +is used to only specify the hash table, so only +.BR htid " may be given, " hash " and " nodeid +have to be omitted. By default, bucket number 0 will be used and can be +overridden by the +.B hashkey +option. +.TP +.BI indev " ifname" +Filter on the incoming interface of the packet. Obviously works only for +forwarded traffic. +.TP +.BI skip_sw +Do not process filter by software. If hardware has no offload support for this +filter, or TC offload is not enabled for the interface, operation will fail. +.TP +.BI skip_hw +Do not process filter by hardware. +.TP +.BI help +Print a brief help text about possible options. +.SH SELECTORS +Basically the only real selector is +.B u32 . +All others merely provide a higher level syntax and are internally translated +into +.B u32 . +.TP +.BI u32 " VAL_MASK_32" +.TQ +.BI u16 " VAL_MASK_16" +.TQ +.BI u8 " VAL_MASK_8" +Match packet data to a given value. The selector name defines the sample length +to extract (32bits for +.BR u32 , +16bits for +.B u16 +and 8bits for +.BR u8 ). 
+Before comparing, the sample is binary AND'ed with the given mask. This way +uninteresting bits can be cleared before comparison. The position of the sample +is defined by the offset specified in +.IR AT . +.TP +.BI ip " IP" +.TQ +.BI ip6 " IP6" +Assume packet starts with an IPv4 ( +.BR ip ) +or IPv6 ( +.BR ip6 ) +header. +.IR IP / IP6 +then allows one to match various header fields: +.RS +.TP +.BI src " ADDR" +.TQ +.BI dst " ADDR" +Compare Source or Destination Address fields against the value of +.IR ADDR . +The reserved words +.BR default ", " any " and " all +effectively match any address. Otherwise an IP address of the particular +protocol is expected, optionally suffixed by a prefix length to match whole +subnets. In case of IPv4 a netmask may also be given. +.TP +.BI dsfield " VAL_MASK_8" +IPv4 only. Match the packet header's DSCP/ECN field. Synonyms to this are +.BR tos " and " precedence . +.TP +.BI ihl " VAL_MASK_8" +IPv4 only. Match the Internet Header Length field. Note that the value's unit is +32bits, so to match a packet with 24byte header length +.I u8_value +has to be 6. +.TP +.BI protocol " VAL_MASK_8" +Match the Protocol (IPv4) or Next Header (IPv6) field value, e.g. 6 for TCP. +.TP +.BI icmp_type " VAL_MASK_8" +.TQ +.BI icmp_code " VAL_MASK_8" +Assume a next-header protocol of icmp or ipv6-icmp and match Type or Code +field values. This is dangerous, as the code assumes minimal header size for +IPv4 and lack of extension headers for IPv6. +.TP +.BI sport " VAL_MASK_16" +.TQ +.BI dport " VAL_MASK_16" +Match layer four source or destination ports. This is dangerous as well, as it +assumes a suitable layer four protocol is present (which has Source and +Destination Port fields right at the start of the header and 16bit in size). +Also minimal header size for IPv4 and lack of IPv6 extension headers is assumed. +.TP +.B nofrag +.TQ +.B firstfrag +.TQ +.B df +.TQ +.B mf +IPv4 only, check certain flags and fragment offset values. 
Match if the packet +is not a fragment +.RB ( nofrag ), +the first fragment of a fragmented packet +.RB ( firstfrag ), +if Don't Fragment +.RB ( df ) +or More Fragments +.RB ( mf ) +bits are set. +.TP +.BI priority " VAL_MASK_8" +IPv6 only. Match the header's Traffic Class field, which has the same purpose +and semantics as IPv4's ToS field since RFC 3168: upper six bits are DSCP, the +lower two ECN. +.TP +.BI flowlabel " VAL_MASK_32" +IPv6 only. Match the Flow Label field's value. Note that Flow Label itself is +only 20bits long, which are the least significant ones here. The remaining +upper 12bits match Version and Traffic Class fields. +.RE +.TP +.BI tcp " TCPUDP" +.TQ +.BI udp " TCPUDP" +Match fields of next header of protocol TCP or UDP. The possible values for +.I TCPUDP +are: +.RS +.TP +.BI src " VAL_MASK_16" +Match on Source Port field value. +.TP +.BI dst " VAL_MASK_16" +Match on Destination Port field value. +.RE +.TP +.BI icmp " ICMP" +Match fields of next header of protocol ICMP. The possible values for +.I ICMP +are: +.RS +.TP +.BI type " VAL_MASK_8" +Match on ICMP Type field. +.TP +.BI code " VAL_MASK_8" +Match on ICMP Code field. +.RE +.TP +.BI mark " VAL_MASK_32" +Match on netfilter fwmark value. +.TP +.BI ether " ETHER" +Match on ethernet header fields. Possible values for +.I ETHER +are: +.RS +.TP +.BI src " ether_address" " " AT +.TQ +.BI dst " ether_address" " " AT +Match on source or destination ethernet address. This is dangerous: It assumes +an ethernet header is present at the start of the packet. This will probably +lead to unexpected things if used with layer three interfaces such as tun or +ppp. +.RE +.SH EXAMPLES +.RS +.EX +tc filter add dev eth0 parent 999:0 prio 99 protocol ip u32 \\ + match ip src 192.168.8.0/24 classid 1:1 +.EE +.RE + +This attaches a filter to the qdisc identified by +.BR 999:0 .
+Its priority is +.BR 99 , +which affects in which order multiple filters attached to the same +.B parent +are consulted (the lower the earlier). The filter handles packets of +.B protocol +type +.BR ip , +and +.BR match es +if the IP header's source address is within the +.B 192.168.8.0/24 +subnet. Matching packets are classified into class +.BR 1:1 . +The effect of this command might be surprising at first glance: + +.RS +.EX +filter parent 1: protocol ip pref 99 u32 +filter parent 1: protocol ip pref 99 u32 \\ + fh 800: ht divisor 1 +filter parent 1: protocol ip pref 99 u32 \\ + fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 \\ + match c0a80800/ffffff00 at 12 +.EE +.RE + +So parent +.B 1: +is assigned a new +.B u32 +filter, which contains a hash table of size 1 (as the +.B divisor +indicates). The table ID is +.BR 800 . +The third line then shows the actual filter which was added above: it sits in +table +.B 800 +and bucket +.BR 0 , +classifies packets into class ID +.B 1:1 +and matches the upper three bytes of the four byte value at offset +.B 12 +to be +.BR 0xc0a808 , +which is 192, 168 and 8. + +Now for something more complicated, namely creating a custom hash table: + +.RS +.EX +tc filter add dev eth0 prio 99 handle 1: u32 divisor 256 +.EE +.RE + +This creates a table of size 256 with handle +.B 1: +in priority +.BR 99 . +The effect is as follows: + +.RS +.EX +filter parent 1: protocol all pref 99 u32 +filter parent 1: protocol all pref 99 u32 fh 1: ht divisor 256 +filter parent 1: protocol all pref 99 u32 fh 800: ht divisor 1 +.EE +.RE + +So along with the requested hash table (handle +.BR 1: ), +the kernel has created its own table of size 1 to hold other filters of the same +priority.
+ +The next step is to create a filter which links to the created hash table: + +.RS +.EX +tc filter add dev eth0 parent 1: prio 1 u32 \\ + link 1: hashkey mask 0x0000ff00 at 12 \\ + match ip src 192.168.0.0/16 +.EE +.RE + +The filter is given a lower priority than the hash table itself so +.B u32 +consults it before manually traversing the hash table. The options +.BR link " and " hashkey +determine which table and bucket to redirect to. In this case the hash key +should be constructed out of the second byte at offset 12, which corresponds to +an IP packet's third byte of the source address field. Along with the +.B match +statement, this effectively maps all class C networks below 192.168.0.0/16 to +different buckets of the hash table. + +Filters for certain subnets can be created like so: + +.RS +.EX +tc filter add dev eth0 parent 1: prio 99 u32 \\ + ht 1: sample u32 0x00000800 0x0000ff00 at 12 \\ + match ip src 192.168.8.0/24 classid 1:1 +.EE +.RE + +The bucket is defined using the +.B sample +option: In this case, the second byte at offset 12 must be 0x08, exactly. In +this case, the resulting bucket ID is obviously 8, but as soon as +.B sample +selects an amount of data which could exceed the +.BR divisor , +one would have to know the kernel-internal algorithm to deduce the destination +bucket. This filter's +.B match +statement is redundant in this case, as the entropy for the hash key does not +exceed the table size and therefore no collisions can occur. Otherwise it's +necessary to prevent matching unwanted packets. + +Matching upper layer fields is problematic since IPv4 header length is variable +and IPv6 supports extension headers which affect upper layer header offset. To +overcome this, there is the possibility to specify +.B nexthdr+ +when giving an offset, and to make things easier there are the +.BR tcp " and " udp +matches which use +.B nexthdr+ +implicitly. 
This offset has to be calculated beforehand though, and the only +way to achieve that is by doing it in a separate filter which then links to the +filter which wants to use it. Here is an example of doing so: + +.RS +.EX +tc filter add dev eth0 parent 1:0 protocol ip handle 1: \\ + u32 divisor 1 +tc filter add dev eth0 parent 1:0 protocol ip \\ + u32 ht 1: \\ + match tcp src 22 FFFF \\ + classid 1:2 +tc filter add dev eth0 parent 1:0 protocol ip \\ + u32 ht 800: \\ + match ip protocol 6 FF \\ + match u16 0 1fff at 6 \\ + offset at 0 mask 0f00 shift 6 \\ + link 1: +.EE +.RE + +This is what is being done: In the first call, a single element sized hash table +is created so there is a place to hold the linked-to filter and a known handle +.RB ( 1: ) +to refer to it. The second call then adds the actual filter, which pushes +packets with TCP source port 22 into class +.BR 1:2 . +Using +.BR ht , +it is moved into the hash table created by the first call. The third call then +does the actual magic: It matches IPv4 packets with next layer protocol 6 (TCP), +only if it's the first fragment (usually TCP sets DF bit, but if it doesn't and +the packet is fragmented, only the first one contains the TCP header), and then +sets the offset based on the IP header's IHL field (right-shifting by 6 +eliminates the offset of the field and at the same time converts the value into +bytes). Finally, using +.BR link , +the hash table from the first call is referenced which holds the filter from the second +call. +.SH SEE ALSO +.BR tc (8), +.br +.BR cls_u32.txt " at " http://linux-tc-notes.sourceforge.net/ diff --git a/man/man8/tc-vlan.8 b/man/man8/tc-vlan.8 new file mode 100644 index 0000000..e199c9a --- /dev/null +++ b/man/man8/tc-vlan.8 @@ -0,0 +1,164 @@ +.TH "VLAN manipulation action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +vlan - vlan manipulation module +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ...
" "action vlan" " { " pop " | " pop_eth " |" +.IR PUSH " | " MODIFY " | " PUSH_ETH " } [ " CONTROL " ]" + +.ti -8 +.IR PUSH " := " +.BR push " [ " protocol +.IR VLANPROTO " ]" +.BR " [ " priority +.IR VLANPRIO " ] " +.BI id " VLANID" + +.ti -8 +.IR MODIFY " := " +.BR modify " [ " protocol +.IR VLANPROTO " ]" +.BR " [ " priority +.IR VLANPRIO " ] " +.BI id " VLANID" + +.ti -8 +.IR PUSH_ETH " := " +.B push_eth +.BI dst_mac " LLADDR " src_mac " LLADDR " + +.ti -8 +.IR CONTROL " := { " +.BR reclassify " | " pipe " | " drop " | " continue " | " pass " | " goto " " chain " " CHAIN_INDEX " }" +.SH DESCRIPTION +The +.B vlan +action allows one to perform 802.1Q en- or decapsulation on a packet, reflected by +the operation modes +.IR POP ", " PUSH " and " MODIFY . +The +.I POP +mode is simple, as no further information is required to just drop the +outer-most VLAN encapsulation. The +.IR PUSH " and " MODIFY +modes require at least a +.I VLANID +and allow one to optionally choose the +.I VLANPROTO +to use. + +The +.B vlan +action can also be used to add or remove the base Ethernet header. The +.B pop_eth +mode, which takes no argument, is used to remove the base Ethernet header. All +existing VLANs must have been previously dropped. The opposite operation, +adding a base Ethernet header, is done with the +.B push_eth +mode. In that case, the packet must have no MAC header (stacking MAC headers is +not permitted). This mode is mostly useful when a previous action has +encapsulated the whole original frame behind a network header and one needs +to prepend an Ethernet header before forwarding the resulting packet. + +.SH OPTIONS +.TP +.B pop +Decapsulation mode, no further arguments allowed. +.TP +.B push +Encapsulation mode. Requires at least +.B id +option. +.TP +.B modify +Replace mode. Existing 802.1Q tag is replaced. Requires at least +.B id +option. +.TP +.B pop_eth +Ethernet header decapsulation mode. 
Only works on a plain Ethernet header: +VLANs, if any, must be removed first. +.TP +.B push_eth +Ethernet header encapsulation mode. The Ethertype is automatically set +using the network header type. Chaining Ethernet headers is not allowed: the +packet must have no MAC header when using this mode. Requires the +.BR "dst_mac " and " src_mac " options. +.TP +.BI id " VLANID" +Specify the VLAN ID to encapsulate into. +.I VLANID +is an unsigned 16bit integer, the format is detected automatically (e.g. prefix +with +.RB ' 0x ' +for hexadecimal interpretation, etc.). +.TP +.BI protocol " VLANPROTO" +Choose the VLAN protocol to use. At the time of writing, the kernel accepts only +.BR 802.1Q " or " 802.1ad . +.TP +.BI priority " VLANPRIO" +Choose the VLAN priority to use. Decimal number in range of 0-7. +.TP +.BI dst_mac " LLADDR" +Choose the destination MAC address to use. +.TP +.BI src_mac " LLADDR" +Choose the source MAC address to use. +.TP +.I CONTROL +How to continue after executing this action. +.RS +.TP +.B reclassify +Restarts classification by jumping back to the first filter attached to this +action's parent. +.TP +.B pipe +Continue with the next action, this is the default. +.TP +.B drop +Packet will be dropped without running further actions. +.TP +.B continue +Continue classification with next filter in line. +.TP +.B pass +Return to calling qdisc for packet processing. This ends the classification +process. 
+.RE +.SH EXAMPLES +The following example encapsulates incoming ICMP packets on eth0 from 10.0.0.2 +into VLAN ID 123: + +.RS +.EX +#tc qdisc add dev eth0 handle ffff: ingress +#tc filter add dev eth0 parent ffff: pref 11 protocol ip \\ + u32 match ip protocol 1 0xff flowid 1:1 \\ + match ip src 10.0.0.2 flowid 1:1 \\ + action vlan push id 123 +.EE +.RE + +Here is an example of the +.B pop +function: Incoming VLAN packets on eth0 are decapsulated and the classification +process then restarted for the plain packet: + +.RS +.EX +#tc qdisc add dev eth0 handle ffff: ingress +#tc filter add dev $ETH parent ffff: pref 1 protocol 802.1Q \\ + u32 match u32 0 0 flowid 1:1 \\ + action vlan pop reclassify +.EE +.RE + +For an example of the +.BR pop_eth " and " push_eth " modes, see " tc-mpls (8). + +.SH SEE ALSO +.BR tc "(8), " tc-mpls (8) diff --git a/man/man8/tc.8 b/man/man8/tc.8 new file mode 100644 index 0000000..3175454 --- /dev/null +++ b/man/man8/tc.8 @@ -0,0 +1,893 @@ +.TH TC 8 "16 December 2001" "iproute2" "Linux" +.SH NAME +tc \- show / manipulate traffic control settings +.SH SYNOPSIS +.B tc +.RI "[ " OPTIONS " ]" +.B qdisc [ add | change | replace | link | delete ] dev +\fIDEV\fR +.B +[ parent +\fIqdisc-id\fR +.B | root ] +.B [ handle +\fIqdisc-id\fR ] +.B [ ingress_block +\fIBLOCK_INDEX\fR ] +.B [ egress_block +\fIBLOCK_INDEX\fR ] qdisc +[ qdisc specific parameters ] +.P + +.B tc +.RI "[ " OPTIONS " ]" +.B class [ add | change | replace | delete | show ] dev +\fIDEV\fR +.B parent +\fIqdisc-id\fR +.B [ classid +\fIclass-id\fR ] qdisc +[ qdisc specific parameters ] +.P + +.B tc +.RI "[ " OPTIONS " ]" +.B filter [ add | change | replace | delete | get ] dev +\fIDEV\fR +.B [ parent +\fIqdisc-id\fR +.B | root ] [ handle \fIfilter-id\fR ] +.B protocol +\fIprotocol\fR +.B prio +\fIpriority\fR filtertype +[ filtertype specific parameters ] +.B flowid +\fIflow-id\fR + +.B tc +.RI "[ " OPTIONS " ]" +.B filter [ add | change | replace | delete | get ] block 
+\fIBLOCK_INDEX\fR +.B [ handle \fIfilter-id\fR ] +.B protocol +\fIprotocol\fR +.B prio +\fIpriority\fR filtertype +[ filtertype specific parameters ] +.B flowid +\fIflow-id\fR + +.B tc +.RI "[ " OPTIONS " ]" +.B chain [ add | delete | get ] dev +\fIDEV\fR +.B [ parent +\fIqdisc-id\fR +.B | root ]\fR filtertype +[ filtertype specific parameters ] + +.B tc +.RI "[ " OPTIONS " ]" +.B chain [ add | delete | get ] block +\fIBLOCK_INDEX\fR filtertype +[ filtertype specific parameters ] + + +.B tc +.RI "[ " OPTIONS " ]" +.RI "[ " FORMAT " ]" +.B qdisc { show | list } [ dev +\fIDEV\fR +.B ] [ root | ingress | handle +\fIQHANDLE\fR +.B | parent +\fICLASSID\fR +.B ] [ invisible ] +.P +.B tc +.RI "[ " OPTIONS " ]" +.RI "[ " FORMAT " ]" +.B class show dev +\fIDEV\fR +.P +.B tc +.RI "[ " OPTIONS " ]" +.B filter show dev +\fIDEV\fR +.P +.B tc +.RI "[ " OPTIONS " ]" +.B filter show block +\fIBLOCK_INDEX\fR +.P +.B tc +.RI "[ " OPTIONS " ]" +.B chain show dev +\fIDEV\fR +.P +.B tc +.RI "[ " OPTIONS " ]" +.B chain show block +\fIBLOCK_INDEX\fR + +.P +.B tc +.RI "[ " OPTIONS " ]" +.B monitor [ file +\fIFILENAME\fR +.B ] + +.P +.ti 8 +.IR OPTIONS " := {" +\fB[ -force ] -b\fR[\fIatch\fR] \fB[ filename ] \fR| +\fB[ \fB-n\fR[\fIetns\fR] name \fB] \fR| +\fB[ \fB-N\fR[\fIumeric\fR] \fB] \fR| +\fB[ \fB-nm \fR| \fB-nam\fR[\fIes\fR] \fB] \fR| +\fB[ \fR{ \fB-cf \fR| \fB-c\fR[\fIonf\fR] \fR} \fB[ filename ] \fB] \fR +\fB[ -t\fR[imestamp\fR] \fB\] \fR| \fB[ -t\fR[short\fR] \fR| \fB[ +-o\fR[neline\fR] \fB]\fR } + +.ti 8 +.IR FORMAT " := {" +\fB\-s\fR[\fItatistics\fR] | +\fB\-d\fR[\fIetails\fR] | +\fB\-r\fR[\fIaw\fR] | +\fB\-i\fR[\fIec\fR] | +\fB\-g\fR[\fIraph\fR] | +\fB\-j\fR[\fIjson\fR] | +\fB\-p\fR[\fIretty\fR] | +\fB\-col\fR[\fIor\fR] } + +.SH DESCRIPTION +.B Tc +is used to configure Traffic Control in the Linux kernel. Traffic Control consists +of the following: + +.TP +SHAPING +When traffic is shaped, its rate of transmission is under control. 
Shaping may +be more than lowering the available bandwidth - it is also used to smooth out +bursts in traffic for better network behaviour. Shaping occurs on egress. + +.TP +SCHEDULING +By scheduling the transmission of packets it is possible to improve interactivity +for traffic that needs it while still guaranteeing bandwidth to bulk transfers. Reordering +is also called prioritizing, and happens only on egress. + +.TP +POLICING +Whereas shaping deals with transmission of traffic, policing pertains to traffic +arriving. Policing thus occurs on ingress. + +.TP +DROPPING +Traffic exceeding a set bandwidth may also be dropped forthwith, both on +ingress and on egress. + +.P +Processing of traffic is controlled by three kinds of objects: qdiscs, +classes and filters. + +.SH QDISCS +.B qdisc +is short for 'queueing discipline' and it is elementary to +understanding traffic control. Whenever the kernel needs to send a +packet to an interface, it is +.B enqueued +to the qdisc configured for that interface. Immediately afterwards, the kernel +tries to get as many packets as possible from the qdisc, for giving them +to the network adaptor driver. + +A simple QDISC is the 'pfifo' one, which does no processing at all and is a pure +First In, First Out queue. It does however store traffic when the network interface +can't handle it momentarily. + +.SH CLASSES +Some qdiscs can contain classes, which contain further qdiscs - traffic may +then be enqueued in any of the inner qdiscs, which are within the +.B classes. +When the kernel tries to dequeue a packet from such a +.B classful qdisc +it can come from any of the classes. A qdisc may for example prioritize +certain kinds of traffic by trying to dequeue from certain classes +before others. + +.SH FILTERS +A +.B filter +is used by a classful qdisc to determine in which class a packet will +be enqueued. Whenever traffic arrives at a class with subclasses, it needs +to be classified. 
Various methods may be employed to do so; one of these +is the use of filters. All filters attached to the class are called, until one of +them returns with a verdict. If no verdict was made, other criteria may be +available. This differs per qdisc. + +It is important to notice that filters reside +.B within +qdiscs - they are not masters of what happens. + +The available filters are: +.TP +basic +Filter packets based on an ematch expression. See +.BR tc-ematch (8) +for details. +.TP +bpf +Filter packets using (e)BPF, see +.BR tc-bpf (8) +for details. +.TP +cgroup +Filter packets based on the control group of their process. See +.BR tc-cgroup (8) +for details. +.TP +flow, flower +Flow-based classifiers, filtering packets based on their flow (identified by selectable keys). See +.BR tc-flow "(8) and" +.BR tc-flower (8) +for details. +.TP +fw +Filter based on fwmark. Directly maps fwmark value to traffic class. See +.BR tc-fw (8). +.TP +route +Filter packets based on routing table. See +.BR tc-route (8) +for details. +.TP +u32 +Generic filtering on arbitrary packet data, assisted by syntax to abstract common operations. See +.BR tc-u32 (8) +for details. +.TP +matchall +Traffic control filter that matches every packet. See +.BR tc-matchall (8) +for details. + +.SH QEVENTS +Qdiscs may invoke user-configured actions when certain interesting events +take place in the qdisc. Each qevent can either be unused, or can have a +block attached to it. To this block are then attached filters using the "tc +block BLOCK_IDX" syntax. The block is executed when the qevent associated +with the attachment point takes place. For example, a packet could be +dropped, or delayed, etc., depending on the qdisc and the qevent in +question.
+ +For example: +.PP +.RS +tc qdisc add dev eth0 root handle 1: red limit 500K avpkt 1K \\ + qevent early_drop block 10 +.RE +.RS +tc filter add block 10 matchall action mirred egress mirror dev eth1 +.RE + +.SH CLASSLESS QDISCS +The classless qdiscs are: +.TP +choke +CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for unresponsive +flows) is a classless qdisc designed to both identify and penalize flows that +monopolize the queue. CHOKe is a variation of RED, and the configuration is +similar to RED. +.TP +codel +CoDel (pronounced "coddle") is an adaptive "no-knobs" active queue management +algorithm (AQM) scheme that was developed to address the shortcomings of +RED and its variants. +.TP +[p|b]fifo +Simplest usable qdisc, pure First In, First Out behaviour. Limited in +packets or in bytes. +.TP +fq +Fair Queue Scheduler realises TCP pacing and scales to millions of concurrent +flows per qdisc. +.TP +fq_codel +Fair Queuing Controlled Delay is a queuing discipline that combines Fair +Queuing with the CoDel AQM scheme. FQ_Codel uses a stochastic model to classify +incoming packets into different flows and is used to provide a fair share of the +bandwidth to all the flows using the queue. Each such flow is managed by the +CoDel queuing discipline. Reordering within a flow is avoided since CoDel +internally uses a FIFO queue. +.TP +fq_pie +FQ-PIE (Flow Queuing with Proportional Integral controller Enhanced) is a +queuing discipline that combines Flow Queuing with the PIE AQM scheme. FQ-PIE +uses a Jenkins hash function to classify incoming packets into different flows +and is used to provide a fair share of the bandwidth to all the flows using the +qdisc. Each such flow is managed by the PIE algorithm. +.TP +gred +Generalized Random Early Detection combines multiple RED queues in order to +achieve multiple drop priorities. This is required to realize Assured +Forwarding (RFC 2597).
+.TP +hhf +Heavy-Hitter Filter differentiates between small flows and the opposite, +heavy-hitters. The goal is to catch the heavy-hitters and move them to a +separate queue with less priority so that bulk traffic does not affect the +latency of critical traffic. +.TP +ingress +This is a special qdisc as it applies to incoming traffic on an interface, allowing for it to be filtered and policed. +.TP +mqprio +The Multiqueue Priority Qdisc is a simple queuing discipline that allows +mapping traffic flows to hardware queue ranges using priorities and a +configurable priority to traffic class mapping. A traffic class in this context +is a set of contiguous qdisc classes which map 1:1 to a set of hardware exposed +queues. +.TP +multiq +Multiqueue is a qdisc optimized for devices with multiple Tx queues. It has +been added for hardware that wishes to avoid head-of-line blocking. It will +cycle through the bands and verify that the hardware queue associated with the +band is not stopped prior to dequeuing a packet. +.TP +netem +Network Emulator is an enhancement of the Linux traffic control facilities that +allow one to add delay, packet loss, duplication and other characteristics to +packets outgoing from a selected network interface. +.TP +pfifo_fast +Standard qdisc for 'Advanced Router' enabled kernels. Consists of a three-band +queue which honors Type of Service flags, as well as the priority that may be +assigned to a packet. +.TP +pie +Proportional Integral controller-Enhanced (PIE) is a control theoretic active +queue management scheme. It is based on the proportional integral controller but +aims to control delay. +.TP +red +Random Early Detection simulates physical congestion by randomly dropping +packets when nearing configured bandwidth allocation. Well suited to very +large bandwidth applications.
+.TP +sfb +Stochastic Fair Blue is a classless qdisc to manage congestion based on +packet loss and link utilization history while trying to prevent +non-responsive flows (i.e. flows that do not react to congestion marking +or dropped packets) from impacting performance of responsive flows. +Unlike RED, where the marking probability has to be configured, BLUE +tries to determine the ideal marking probability automatically. +.TP +sfq +Stochastic Fairness Queueing reorders queued traffic so each 'session' +gets to send a packet in turn. +.TP +tbf +The Token Bucket Filter is suited for slowing traffic down to a precisely +configured rate. Scales well to large bandwidths. +.SH CONFIGURING CLASSLESS QDISCS +In the absence of classful qdiscs, classless qdiscs can only be attached at +the root of a device. Full syntax: +.P +.B tc qdisc add dev +\fIDEV\fR +.B root +QDISC QDISC-PARAMETERS + +To remove, issue +.P +.B tc qdisc del dev +\fIDEV\fR +.B root + +The +.B pfifo_fast +qdisc is the automatic default in the absence of a configured qdisc. + +.SH CLASSFUL QDISCS +The classful qdiscs are: +.TP +ATM +Map flows to virtual circuits of an underlying asynchronous transfer mode +device. +.TP +DRR +The Deficit Round Robin Scheduler is a more flexible replacement for Stochastic +Fairness Queuing. Unlike SFQ, there are no built-in queues \-\- you need to add +classes and then set up filters to classify packets accordingly. This can be +useful e.g. for using RED qdiscs with different settings for particular +traffic. There is no default class \-\- if a packet cannot be classified, it is +dropped. +.TP +ETS +The ETS qdisc is a queuing discipline that merges functionality of PRIO and DRR +qdiscs in one scheduler. ETS makes it easy to configure a set of strict and +bandwidth-sharing bands to implement the transmission selection described in +802.1Qaz. 
+.TP +HFSC +Hierarchical Fair Service Curve guarantees precise bandwidth and delay allocation for leaf classes and allocates excess bandwidth fairly. Unlike HTB, it makes use of packet dropping to achieve low delays which interactive sessions benefit from. +.TP +HTB +The Hierarchy Token Bucket implements a rich linksharing hierarchy of +classes with an emphasis on conforming to existing practices. HTB facilitates +guaranteeing bandwidth to classes, while also allowing specification of upper +limits to inter-class sharing. It contains shaping elements, based on TBF and +can prioritize classes. +.TP +PRIO +The PRIO qdisc is a non-shaping container for a configurable number of +classes which are dequeued in order. This allows for easy prioritization +of traffic, where lower classes are only able to send if higher ones have +no packets available. To facilitate configuration, Type Of Service bits are +honored by default. +.TP +QFQ +Quick Fair Queueing is an O(1) scheduler that provides near-optimal guarantees, +and is the first to achieve that goal with a constant cost also with respect to +the number of groups and the packet length. The QFQ algorithm has no loops, and +uses very simple instructions and data structures that lend themselves very +well to a hardware implementation. +.SH THEORY OF OPERATION +Classes form a tree, where each class has a single parent. +A class may have multiple children. Some qdiscs allow for runtime addition +of classes (HTB) while others (PRIO) are created with a static number of +children. + +Qdiscs which allow dynamic addition of classes can have zero or more +subclasses to which traffic may be enqueued. + +Furthermore, each class contains a +.B leaf qdisc +which by default has +.B pfifo +behaviour, although another qdisc can be attached in place. This qdisc may again +contain classes, but each class can have only one leaf qdisc. + +When a packet enters a classful qdisc it can be +.B classified +to one of the classes within. 
Three criteria are available, although not all +qdiscs will use all three: +.TP +tc filters +If tc filters are attached to a class, they are consulted first +for relevant instructions. Filters can match on all fields of a packet header, +as well as on the firewall mark applied by iptables. +.TP +Type of Service +Some qdiscs have built in rules for classifying packets based on the TOS field. +.TP +skb->priority +Userspace programs can encode a \fIclass-id\fR in the 'skb->priority' field using +the SO_PRIORITY option. +.P +Each node within the tree can have its own filters but higher level filters +may also point directly to lower classes. + +If classification did not succeed, packets are enqueued to the leaf qdisc +attached to that class. Check qdisc specific manpages for details, however. + +.SH NAMING +All qdiscs, classes and filters have IDs, which can either be specified +or be automatically assigned. + +IDs consist of a +.BR major " number and a " minor +number, separated by a colon - +.BR major ":" minor "." +Both +.BR major " and " minor +are hexadecimal numbers and are limited to 16 bits. There are two special +values: root is signified by +.BR major " and " minor +of all ones, and unspecified is all zeros. + +.TP +QDISCS +A qdisc, which potentially can have children, gets assigned a +.B major +number, called a 'handle', leaving the +.B minor +number namespace available for classes. The handle is expressed as '10:'. +It is customary to explicitly assign a handle to qdiscs expected to have children. + +.TP +CLASSES +Classes residing under a qdisc share their qdisc +.B major +number, but each have a separate +.B minor +number called a 'classid' that has no relation to their +parent classes, only to their parent qdisc. The same naming custom as for +qdiscs applies. + +.TP +FILTERS +Filters have a three part ID, which is only needed when using a hashed +filter hierarchy. + +.SH PARAMETERS +The following parameters are widely used in TC. 
For other parameters, +see the man pages for individual qdiscs. + +.TP +RATES +Bandwidths or rates. +These parameters accept a floating point number, possibly followed by +either a unit (both SI and IEC units supported), or a float followed by a '%' +character to specify the rate as a percentage of the device's speed +(e.g. 5%, 99.5%). Warning: specifying the rate as a percentage means a fraction +of the current speed; if the speed changes, the value will not be recalculated. +.RS +.TP +bit or a bare number +Bits per second +.TP +kbit +Kilobits per second +.TP +mbit +Megabits per second +.TP +gbit +Gigabits per second +.TP +tbit +Terabits per second +.TP +bps +Bytes per second +.TP +kbps +Kilobytes per second +.TP +mbps +Megabytes per second +.TP +gbps +Gigabytes per second +.TP +tbps +Terabytes per second + +.P +To specify in IEC units, replace the SI prefix (k-, m-, g-, t-) with +IEC prefix (ki-, mi-, gi- and ti-) respectively. + +.P +TC stores rates as a 32-bit unsigned integer in bps internally, +so we can specify a max rate of 4294967295 bps. +.RE + +.TP +TIMES +Length of time. Can be specified as a floating point number +followed by an optional unit: +.RS +.TP +s, sec or secs +Whole seconds +.TP +ms, msec or msecs +Milliseconds +.TP +us, usec, usecs or a bare number +Microseconds. + +.P +TC defines its own time unit (equal to a microsecond) and stores +time values as a 32-bit unsigned integer, thus we can specify a max time value +of 4294967295 usecs. +.RE + +.TP +SIZES +Amounts of data. Can be specified as a floating point number +followed by an optional unit: +.RS +.TP +b or a bare number +Bytes. +.TP +kbit +Kilobits +.TP +kb or k +Kilobytes +.TP +mbit +Megabits +.TP +mb or m +Megabytes +.TP +gbit +Gigabits +.TP +gb or g +Gigabytes + +.P +TC stores sizes internally as a 32-bit unsigned integer in bytes, +so we can specify a max size of 4294967295 bytes. +.RE + +.TP +VALUES +Other values without a unit.
+These parameters are interpreted as decimal by default, but you can +instruct TC to interpret them as octal or hexadecimal by adding a '0' +or '0x' prefix respectively. + +.SH TC COMMANDS +The following commands are available for qdiscs, classes and filters: +.TP +add +Add a qdisc, class or filter to a node. For all entities, a +.B parent +must be passed, either by passing its ID or by attaching directly to the root of a device. +When creating a qdisc or a filter, it can be named with the +.B handle +parameter. A class is named with the +.B \fBclassid\fR +parameter. + +.TP +delete +A qdisc can be deleted by specifying its handle, which may also be 'root'. All subclasses and their leaf qdiscs +are automatically deleted, as well as any filters attached to them. + +.TP +change +Some entities can be modified 'in place'. Shares the syntax of 'add', with the exception +that the handle cannot be changed and neither can the parent. In other words, +.B +change +cannot move a node. + +.TP +replace +Performs a nearly atomic remove/add on an existing node id. If the node does not exist yet +it is created. + +.TP +get +Displays a single filter given the interface \fIDEV\fR, \fIqdisc-id\fR, +\fIpriority\fR, \fIprotocol\fR and \fIfilter-id\fR. + +.TP +show +Displays all filters attached to the given interface. A valid parent ID must be passed. + +.TP +link +Only available for qdiscs and performs a replace where the node +must exist already. + +.SH MONITOR +The\fB\ tc\fR\ utility can monitor events generated by the kernel such as +adding/deleting qdiscs, filters or actions, or modifying existing ones. + +The following command is available for\fB\ monitor\fR\ : +.TP +\fBfile\fR +If the file option is given, the \fBtc\fR does not listen to kernel events, but opens +the given file and dumps its contents. The file has to be in binary +format and contain netlink messages.
+ +.SH OPTIONS + +.TP +.BR "\-b", " \-b filename", " \-batch", " \-batch filename" +read commands from provided file or standard input and invoke them. +First failure will cause termination of tc. + +.TP +.BR "\-force" +don't terminate tc on errors in batch mode. +If there were any errors during execution of the commands, the application return code will be non zero. + +.TP +.BR "\-o" , " \-oneline" +output each record on a single line, replacing line feeds +with the +.B '\e' +character. This is convenient when you want to count records +with +.BR wc (1) +or to +.BR grep (1) +the output. + +.TP +.BR "\-n" , " \-net" , " \-netns " <NETNS> +switches +.B tc +to the specified network namespace +.IR NETNS . +Actually it just simplifies executing of: + +.B ip netns exec +.IR NETNS +.B tc +.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" + +to + +.B tc +.RI "-n[etns] " NETNS " [ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.BR help " }" + +.TP +.BR "\-N" , " \-Numeric" +Print the number of protocol, scope, dsfield, etc directly instead of +converting it to human readable name. + +.TP +.BR "\-cf" , " \-conf " <FILENAME> +specifies path to the config file. This option is used in conjunction with other options (e.g. +.BR -nm ")." + +.TP +.BR "\-t", " \-timestamp" +When\fB\ tc monitor\fR\ runs, print timestamp before the event message in format: + Timestamp: <Day> <Month> <DD> <hh:mm:ss> <YYYY> <usecs> usec + +.TP +.BR "\-ts", " \-tshort" +When\fB\ tc monitor\fR\ runs, prints short timestamp before the event message in format: + [<YYYY>-<MM>-<DD>T<hh:mm:ss>.<ms>] + +.SH FORMAT +The show command has additional formatting options: + +.TP +.BR "\-s" , " \-stats", " \-statistics" +output more statistics about packet usage. + +.TP +.BR "\-d", " \-details" +output more detailed information about rates and cell sizes. + +.TP +.BR "\-r", " \-raw" +output raw hex values for handles. 
+ +.TP +.BR "\-p", " \-pretty" +for u32 filter, decode offset and mask values to equivalent filter commands based on TCP/IP. +In JSON output, add whitespace to improve readability. + +.TP +.BR "\-iec" +print rates in IEC units (i.e. 1K = 1024). + +.TP +.BR "\-g", " \-graph" +shows classes as ASCII graph. Prints generic stats info under each class if +.BR "-s" +option was specified. Classes can be filtered only by +.BR "dev" +option. + +.TP +.BR \-c [ color ][ = { always | auto | never }] +Configure color output. If parameter is omitted or +.BR always , +color output is enabled regardless of stdout state. If parameter is +.BR auto , +stdout is checked to be a terminal before enabling color output. If parameter is +.BR never , +color output is disabled. If specified multiple times, the last one takes +precedence. This flag is ignored if +.B \-json +is also given. + +.TP +.BR "\-j", " \-json" +Display results in JSON format. + +.TP +.BR "\-nm" , " \-name" +resolve class name from +.B /etc/iproute2/tc_cls +file or from file specified by +.B -cf +option. This file is just a mapping of +.B classid +to class name: + +.RS 10 +# Here is comment +.RE +.RS 10 +1:40 voip # Here is another comment +.RE +.RS 10 +1:50 web +.RE +.RS 10 +1:60 ftp +.RE +.RS 10 +1:2 home +.RE + +.RS +.B tc +will not fail if +.B -nm +was specified without +.B -cf +option but +.B /etc/iproute2/tc_cls +file does not exist, which makes it possible to pass +.B -nm +option for creating +.B tc +alias. +.RE + +.TP +.BR "\-br" , " \-brief" +Print only essential data needed to identify the filter and action (handle, +cookie, etc.) and stats. This option is currently only supported by +.BR "tc filter show " and " tc actions ls " commands. + +.SH "EXAMPLES" +.PP +tc -g class show dev eth0 +.RS 4 +Shows classes as ASCII graph on eth0 interface. +.RE +.PP +tc -g -s class show dev eth0 +.RS 4 +Shows classes as ASCII graph with stats info under each class. +.RE + +.SH HISTORY +.B tc +was written by Alexey N.
Kuznetsov and added in Linux 2.2. +.SH SEE ALSO +.BR tc-basic (8), +.BR tc-bfifo (8), +.BR tc-bpf (8), +.BR tc-cake (8), +.BR tc-cgroup (8), +.BR tc-choke (8), +.BR tc-codel (8), +.BR tc-drr (8), +.BR tc-ematch (8), +.BR tc-ets (8), +.BR tc-flow (8), +.BR tc-flower (8), +.BR tc-fq (8), +.BR tc-fq_codel (8), +.BR tc-fq_pie (8), +.BR tc-fw (8), +.BR tc-gact (8), +.BR tc-hfsc (7), +.BR tc-hfsc (8), +.BR tc-htb (8), +.BR tc-mqprio (8), +.BR tc-pfifo (8), +.BR tc-pfifo_fast (8), +.BR tc-pie (8), +.BR tc-red (8), +.BR tc-route (8), +.BR tc-sfb (8), +.BR tc-sfq (8), +.BR tc-stab (8), +.BR tc-tbf (8), +.BR tc-u32 (8) +.br +.RB "User documentation at " http://lartc.org/ ", but please direct bugreports and patches to: " <netdev@vger.kernel.org> + +.SH AUTHOR +Manpage maintained by bert hubert (ahu@ds9a.nl) diff --git a/man/man8/tipc-bearer.8 b/man/man8/tipc-bearer.8 new file mode 100644 index 0000000..d95b1e1 --- /dev/null +++ b/man/man8/tipc-bearer.8 @@ -0,0 +1,250 @@ +.TH TIPC-BEARER 8 "02 Jun 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. 
+.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-bearer \- show or modify TIPC bearers + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc bearer add media udp name +.IB "NAME " "remoteip " REMOTEIP +.br + +.ti -8 +.B tipc bearer enable +.RB "[ " domain +.IR DOMAIN " ]" +.RB "[ " priority +.IR PRIORITY " ]" +.BR media +.br +.RB "{ { " eth " | " ib " } " device +.IR "DEVICE" " }" +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME +.B localip +.IR LOCALIP +.RB "[ " localport +.IR LOCALPORT " ]" +.RB "[ " remoteip +.IR REMOTEIP " ]" +.RB "[ " remoteport +.IR REMOTEPORT " ] }" +.br + +.ti -8 +.B tipc bearer disable media +.br +.RB "{ { " eth " | " ib " } " device +.IR "DEVICE " } +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME " }" +.br + +.ti -8 +.B tipc bearer set +.RB "{ " "priority " +.IR PRIORITY +.RB "| " tolerance +.IR TOLERANCE +.RB "| " window +.IR WINDOW +.RB "} " media +.br +.RB "{ { " eth " | " ib " } " device +.IR "DEVICE" " }" +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME " }" +.br + +.ti -8 +.B tipc bearer get +.RB "[ " "priority" " | " tolerance " | " window " ] " media +.br +.RB "{ { " eth " | " ib " } " device +.IR "DEVICE" " }" +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME +.RB "[ " "localip " "| " "localport " "| " "remoteip " "| " "remoteport " "] }" +.br + +.ti -8 +.B tipc bearer list +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc bearer --help +will show bearer help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Bearer identification +.TP +.BI "media " MEDIA +.br +Specifies the TIPC media type for a particular bearer to operate on. +Different media types have different ways of identifying a unique bearer. 
+For example, +.BR "ib " "and " eth +identify a bearer with a +.I DEVICE +while +.B udp +identifies a bearer with a +.IR "LOCALIP " "and a " NAME + +.B ib +- Infiniband +.sp +.B eth +- Ethernet +.sp +.B udp +- User Datagram Protocol (UDP) +.sp + +.TP +.BI "name " NAME +.br +Logical bearer identifier valid for bearers on +.B udp +media. + +.TP +.BI "device " DEVICE +.br +Physical bearer device valid for bearers on +.B eth +and +.B ib +media. + +.SS Bearer properties + +.TP +.B domain +.br +The addressing domain (region) in which a bearer will establish links and accept +link establishment requests. + +.TP +.B priority +.br +Default link priority inherited by all links subsequently established over a +bearer. A single bearer can only host one link to a particular node. This means +the default link priority for a bearer typically affects which bearer to use +when communicating with a particular node in a multi-bearer setup. For more +info about link priority see +.BR tipc-link (8). + +.TP +.B tolerance +.br +Default link tolerance inherited by all links subsequently established over a +bearer. For more info about link tolerance see +.BR tipc-link (8). + +.TP +.B window +.br +Default link window inherited by all links subsequently established over a +bearer. For more info about the link window size see +.BR tipc-link (8). + +.SS UDP bearer options + +.TP +.BI "localip " LOCALIP +.br +Specify a local IP v4/v6 address for a +.B udp +bearer. + +.TP +.BI "localport " LOCALPORT +.br +Specify the local port for a +.B udp +bearer. The default port 6118 is used if no port is specified. + +.TP +.BI "remoteip " REMOTEIP +.br +Specify a remote IP for a +.B udp +bearer. If no remote IP is specified a +.B udp +bearer runs in multicast mode and tries to auto-discover its neighbours. +The multicast IP address is generated based on the TIPC network ID. If a remote +IP is specified the +.B udp +bearer runs in point-to-point mode.
+ +Multiple +.B remoteip +addresses can be added via the +.B bearer add +command. Adding one or more unicast +.B remoteip +addresses to an existing +.B udp +bearer puts the bearer in replicast mode where IP +multicast is emulated by sending multiple unicast messages to each configured +.B remoteip. +When a peer sees a TIPC discovery message from an unknown peer the peer address +is automatically added to the +.B remoteip +(replicast) list, thus only one side of +a link needs to be manually configured. A +.B remoteip +address cannot be added to a multicast bearer. + +.TP +.BI "remoteport " REMOTEPORT +.br +Specify the remote port for a +.B udp +bearer. The default port 6118 is used if no port is specified. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-peer (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/tipc-link.8 b/man/man8/tipc-link.8 new file mode 100644 index 0000000..47dae25 --- /dev/null +++ b/man/man8/tipc-link.8 @@ -0,0 +1,383 @@ +.TH TIPC-LINK 8 "22 Mar 2019" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. 
+.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-link \- show links or modify link properties + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 + +.ti -8 +.B tipc link set +.br +.RB "[ " "{ " "priority " +.IR PRIORITY +.RB "| " tolerance +.IR TOLERANCE +.RB "| " window +.IR "WINDOW " } +.BI "link " LINK " ]" +.RB "|" +.br +.RB "[ " +.RB "{ " broadcast " [ " +.IR BROADCAST +.RB " | " +.IR REPLICAST +.RB " | " +.IR AUTOSELECT +.RB "[ " ratio +.IR SIZE +.RB "] " ] " } " "]" + +.ti -8 +.B tipc link get +.br +.RB "[ " "{ " "priority" " | " tolerance " | " window " } " link +.IR LINK " ] " +.RB "|" +.br +.RB "[ " { " broadcast " } " ]" +.br + +.ti -8 +.B tipc link statistics +.RB "{ " "show " "[ " link +.I LINK +.RB "] | " "reset +.BI "link " "LINK " +} + +.ti -8 +.B tipc link list +.br + +.ti -8 +.B tipc link monitor set +.RB "{ " "threshold" " } " + +.ti -8 +.B tipc link monitor get +.RB "{ " "threshold" " } " + +.ti -8 +.B tipc link monitor summary +.br + +.ti -8 +.B tipc link monitor list +.br +.RB "[ " "media " " { " eth " | " ib " } " device +.IR "DEVICE" " ]" +.RB "|" +.br +.RB "[ " "media udp name" +.IR NAME " ]" +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc link --help +will show link help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. + +.TP +.BR "\-j", " \-json" +Output results in JavaScript Object Notation (JSON). + +.TP +.BR "\-p", " \-pretty" +The default JSON format is compact and more efficient to parse but hard for most users to read. +This flag adds indentation for readability. + +.SH DESCRIPTION + +.SS Link statistics + +.TP +.BR "ACTIVE " "link state" +.br +An +.B ACTIVE +link is serving traffic. Two links to the same node can become +.B ACTIVE +if they have the same link +.BR priority . 
+If there are more than two links with the same priority, the additional links will +be put in +.B STANDBY +state. + +.TP +.BR "STANDBY " "link state" +.br +A +.B STANDBY +link has lower link priority than an +.B ACTIVE +link. A +.B STANDBY +link has control traffic flowing and is ready to take over should the +.B ACTIVE +link(s) go down. + +.TP +.B MTU +.br +The Maximum Transmission Unit. The two endpoints advertise their default or +configured +.B MTU +at initial link setup and will agree to use the lower of the two values should +they differ. + +.TP +.B Packets +.br +The total number of transmitted or received TIPC packets on a link, including +.BR "fragmented " "and " "bundled " packets. + +.TP +.B Fragments +.br +Represented in the form +.BR fragments / fragmented . +Where +.B fragmented +is the number of data messages which have been broken into +.BR fragments . +Subsequently the +.B fragments +are the total number of packets that the +.B fragmented +messages have been broken into. + +.TP +.B Bundles +.br +Represented in the form +.BR bundles / bundled . +If a link becomes congested the link will attempt to bundle data from small +.B bundled +packets into +.B bundles +of full MTU size packets before they are transmitted. + +.TP +.B Profile +.br +Shows the +.B average +packet size in octets/bytes for a +.B sample +of packets. It also shows the packet size distribution of the +.B sampled +packets in the intervals + +0-64 bytes +.br +64-256 bytes +.br +256-1024 bytes +.br +1024-4096 bytes +.br +4096-16384 bytes +.br +16384-32768 bytes +.br +32768-66000 bytes + +.TP +.B Message counters + +.B states +- Number of link state messages +.sp + +.B probes +- Link state messages with probe flag set.
Typically sent when a link is idle. +.sp + +.B nacks +- Number of negative acknowledgement (NACK) packets sent and received by the +link +.sp + +.B defs +- Number of packets received out of order +.sp + +.B dups +- Number of duplicate packets received + +.TP +.B Congestion link +The number of times an application has tried to send data when the TIPC link +was congested. + +.TP +.B Send queue +.B Max +is the maximum number of messages that have resided in the out queue during the +statistics collection period of a link. + +.B Avg +is the average out queue size during the lifetime of a link. + +.SS Link properties + +.TP +.B priority +.br +The priority between logical TIPC links to a particular node. Link priority can +range from 0 (lowest) to 31 (highest). + +.TP +.B tolerance +.br +Link tolerance specifies the maximum time in milliseconds that TIPC will allow +a communication problem to exist before taking the link down. The default value +is 1500 milliseconds. + +.TP +.B window +.br +The link window controls how many unacknowledged messages a link endpoint can +have in its transmit queue before TIPC's congestion control mechanism is +activated. + +.SS Monitor properties + +.TP +.B threshold +.br +The threshold specifies the cluster size exceeding which the link monitoring +algorithm will switch from "full-mesh" to "overlapping-ring". +If set to 0, the overlapping-ring monitoring is always on, and if set to a +value larger than the anticipated cluster size, the overlapping-ring is disabled. +The default value is 32. + +.SS Monitor information + +.TP +.B table_generation +.br +Represents the event count in a node's local monitoring list. It steps every +time something changes in the local monitor list, including changes in the +local domain. + +.TP +.B cluster_size +.br +Represents the current count of cluster members. + +.TP +.B algorithm +.br +The current supervision algorithm used for neighbour monitoring for the bearer. +Possible values are full-mesh or overlapping-ring.
+ +.TP +.B status +.br +The node status derived by the local node. +Possible statuses are up or down. + +.TP +.B monitored +.br +Represents the type of monitoring chosen by the local node. +Possible values are direct or indirect. + +.TP +.B generation +.br +Represents the domain generation which is the event count in a node's local +domain. Every time something changes (peer add/remove/up/down) the domain +generation is stepped and a new version of the node record is sent to inform +the neighbors about this change. The domain generation helps the receiver +of a domain record to know if it should ignore or process the record. + +.TP +.B applied_node_status +.br +The node status reported by the peer node for the succeeding peers in +the node list. The node list is a circular list of ascending addresses +starting with the local node. +Possible statuses are: U or D. The status U implies up and D down. + +.TP +.B [non_applied_node:status] +.br +Represents the nodes and their status as reported by the peer node. +These nodes were not applied to the monitoring list for this peer node. +They are usually transient and occur during the cluster startup phase +or network reconfiguration. +Possible statuses are: U or D. The status U implies up and D down. + +.SS Broadcast properties +.TP +.B BROADCAST +.br +Forces all multicast traffic to be transmitted via broadcast only, +irrespective of cluster size and number of destinations. + +.TP +.B REPLICAST +.br +Forces all multicast traffic to be transmitted via replicast only, +irrespective of cluster size and number of destinations. + +.TP +.B AUTOSELECT +.br +Auto switching to broadcast or replicast depending on cluster size and +destination node number. + +.TP +.B ratio SIZE +.br +Set the AUTOSELECT criteria, percentage of destination nodes vs cluster +size. + +.SH EXAMPLES +.PP +tipc link monitor list +.RS 4 +Shows the link monitoring information for cluster members on device data0.
+.RE +.PP +tipc link monitor summary +.RS 4 +The monitor summary command prints the basic attributes. +.RE + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-media (8), +.BR tipc-bearer (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-peer (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/tipc-media.8 b/man/man8/tipc-media.8 new file mode 100644 index 0000000..4689cb3 --- /dev/null +++ b/man/man8/tipc-media.8 @@ -0,0 +1,87 @@ +.TH TIPC-MEDIA 8 "02 Jun 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-media \- list or modify media properties + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 + +.ti -8 +.B tipc media set +.RB "{ " "priority " +.IR PRIORITY +.RB "| " tolerance +.IR TOLERANCE +.RB "| " window +.IR "WINDOW " } +.BI "media " MEDIA + +.ti -8 +.B tipc media get +.RB "{ " "priority" " | " tolerance " | " window " } " media +.I MEDIA + +.ti -8 +.B tipc media list +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc media --help +will show media help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Media properties + +.TP +.B priority +.br +Default link priority inherited by all bearers subsequently enabled on a +media. For more info about link priority see +.BR tipc-link (8) + +.TP +.B tolerance +.br +Default link tolerance inherited by all bearers subsequently enabled on a +media. 
For more info about link tolerance see +.BR tipc-link (8) + +.TP +.B window +.br +Default link window inherited by all bearers subsequently enabled on a +media. For more info about link window see +.BR tipc-link (8) + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-peer (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/tipc-nametable.8 b/man/man8/tipc-nametable.8 new file mode 100644 index 0000000..f7c51f1 --- /dev/null +++ b/man/man8/tipc-nametable.8 @@ -0,0 +1,110 @@ +.TH TIPC-NAMETABLE 8 "02 Jun 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-nametable \- show TIPC nametable + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc nametable show +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" + +.TP +.BR "\-j", " \-json" +Output results in JavaScript Object Notation (JSON). + +.TP +.BR "\-p", " \-pretty" +The default JSON format is compact and more efficient to parse but hard for most users to read. +This flag adds indentation for readability. + +Show help about last valid command. For example +.B tipc nametable --help +will show nametable help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. + +.SH DESCRIPTION +The nametable shows TIPC publication information. + +.SS Nametable format + +.TP +.B Type +.br +The 32-bit type field of the port name. 
The type field often indicates the class of service +provided by a port. + +.TP +.B Lower +.br +The lower bound of the 32-bit instance field of the port name. +The instance field is often used as a sub-class indicator. + +.TP +.B Upper +.br +The upper bound of the 32-bit instance field of the port name. +The instance field is often used as a sub-class indicator. +A difference in +.BR "lower " "and " upper +means the socket is bound to the port name range [lower,upper] + +.TP +.B Port Identity +.br +The unique socket (port) identifier within the TIPC cluster. The +.B port identity +consists of a node identity followed by a socket reference number. + +.TP +.B Publication +.br +The +.B publication +ID is a random number used internally to represent a publication. + +.TP +.B Scope +.br +The publication +.B scope +specifies the visibility of a bound port name. +The +.B scope +can be specified to comprise three different domains: +.BR node ", " "cluster " "and " zone. +Applications residing within the specified +.B scope +can see and access the port using the displayed port name. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-node (8), +.BR tipc-peer (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/tipc-node.8 b/man/man8/tipc-node.8 new file mode 100644 index 0000000..a72a409 --- /dev/null +++ b/man/man8/tipc-node.8 @@ -0,0 +1,72 @@ +.TH TIPC-NODE 8 "02 Jun 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. 
+.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-node \- modify and show local node parameters or list peer nodes + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc node set +.RB "{ " "address " +.IR ADDRESS +.RB "| " netid +.IR NETID +.RB "} " + +.ti -8 +.B tipc node get +.RB "{ " "address" " | " netid " } " + +.ti -8 +.B tipc node list +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc node --help +will show node help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Node parameters +.TP +.BI address +.br +The TIPC logical address. In the form x.y.z where x, y and z are unsigned +integers. + +.TP +.BI netid +.br +Network identity. Can be used to create individual TIPC clusters on the same +media. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-peer (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/tipc-peer.8 b/man/man8/tipc-peer.8 new file mode 100644 index 0000000..430651f --- /dev/null +++ b/man/man8/tipc-peer.8 @@ -0,0 +1,52 @@ +.TH TIPC-PEER 8 "04 Dec 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned.
+.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-peer \- modify peer information + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc peer remove address +.IR ADDRESS + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc peer --help +will show peer help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Peer remove +Remove an offline peer node from the local data structures. The peer is +identified by its +.B address + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/tipc-socket.8 b/man/man8/tipc-socket.8 new file mode 100644 index 0000000..23ec1e5 --- /dev/null +++ b/man/man8/tipc-socket.8 @@ -0,0 +1,59 @@ +.TH TIPC-SOCKET 8 "02 Jun 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-socket \- show TIPC socket (port) information + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc socket list + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc socket --help +will show socket help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. 
+ +.SH DESCRIPTION +A TIPC socket is represented by an unsigned integer. + +.TP +.SS Bound state +A bound socket has a logical TIPC port name associated with it. + +.TP +.SS Connected state +A connected socket is directly connected to another socket creating a point +to point connection between TIPC sockets. If the connection to X was made using +a logical port name Y that name will show up as +.BR "connected to " "X " "via " Y +. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/tipc.8 b/man/man8/tipc.8 new file mode 100644 index 0000000..6706cca --- /dev/null +++ b/man/man8/tipc.8 @@ -0,0 +1,109 @@ +.TH TIPC 8 "02 Jun 2015" "iproute2" "Linux" +.SH NAME +tipc \- a TIPC configuration and management tool +.SH SYNOPSIS + +.ad l +.in +8 +.ti -8 +.B tipc +.RI "[ " OPTIONS " ] " COMMAND " " ARGUMENTS " +.sp + +.ti -8 +.IR COMMAND " := { " +.BR bearer " | " link " | " media " | " nametable " | " node " | " socket " } +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-h\fR[\fIhelp\fR] } + +.SH DESCRIPTION +The Transparent Inter-Process Communication (TIPC) protocol offers total address +transparency between processes which allows applications in a clustered computer +environment to communicate quickly and reliably with each other, regardless of +their location within the cluster. + +TIPC originated at the telecommunications manufacturer Ericsson. The first open +source version of TIPC was created in 2000 when Ericsson released its first +Linux version of TIPC.
TIPC was introduced in the mainline Linux kernel in 2006 +and is now widely used both within and outside of Ericsson. + +.SH OPTIONS + +.TP +.BR "\-h" , " --help" +Show help about last given command. For example +.B tipc bearer --help +will show bearer help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. + +.TP +.BR "\-j", " \-json" +Output results in JavaScript Object Notation (JSON). + +.TP +.BR "\-p", " \-pretty" +The default JSON format is compact and more efficient to parse but hard for most users to read. +This flag adds indentation for readability. + +.SH COMMANDS + +.TP +.B BEARER +- Show or modify TIPC bearers + +.TP +.B LINK +- Show or modify TIPC links + +.TP +.B MEDIA +- Show or modify TIPC media + +.TP +.B NAMETABLE +- Show TIPC nametable + +.TP +.B NODE +- Show or modify TIPC node parameters + +.TP +.B SOCKET +- Show TIPC sockets + +.SH ARGUMENTS + +Command arguments are described in a command-specific man page and typically +consist of nested commands along with key value pairs. +If no arguments are given a command typically shows its help text. The explicit +help option +.B -h +or +.B --help +can occur anywhere among the arguments and will show help for the last valid +command given. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-peer (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there.
+ +.SH AUTHOR +Richard Alpe <richard.alpe@ericsson.com> diff --git a/man/man8/vdpa-dev.8 b/man/man8/vdpa-dev.8 new file mode 100644 index 0000000..43e5bf4 --- /dev/null +++ b/man/man8/vdpa-dev.8 @@ -0,0 +1,181 @@ +.TH VDPA\-DEV 8 "5 Jan 2021" "iproute2" "Linux" +.SH NAME +vdpa-dev \- vdpa device configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B vdpa +.B dev +.RI "[ " OPTIONS " ] " +.RI " { " COMMAND | " " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] +} + +.ti -8 +.B vdpa dev show +.RI "[ " DEV " ]" + +.ti -8 +.B vdpa dev help + +.ti -8 +.B vdpa dev add +.B name +.I NAME +.B mgmtdev +.I MGMTDEV +.RI "[ device_features " DEVICE_FEATURES " ]" +.RI "[ mac " MACADDR " ]" +.RI "[ mtu " MTU " ]" +.RI "[ max_vqp " MAX_VQ_PAIRS " ]" + +.ti -8 +.B vdpa dev del +.I DEV + +.ti -8 +.B vdpa dev config show +.RI "[ " DEV " ]" + +.ti -8 +.B vdpa dev vstats show +.I DEV +.B qidx +.I QUEUE_INDEX + + +.SH "DESCRIPTION" +.SS vdpa dev show - display vdpa device attributes + +.PP +.I "DEV" +- specifies the vdpa device to show. +If this argument is omitted all devices are listed. + +.in +4 +Format is: +.in +2 +VDPA_DEVICE_NAME + +.SS vdpa dev add - add a new vdpa device. + +.TP +.BI name " NAME" +Name of the new vdpa device to add. + +.TP +.BI mgmtdev " MGMTDEV" +Name of the management device to use for device addition. + +.PP +.BI device_features " DEVICE_FEATURES" +Specifies the virtio device features bit-mask that is provisioned for the new vdpa device. + +The bits can be found under include/uapi/linux/virtio*h. + +See macros such as VIRTIO_F_ and VIRTIO_XXX(e.g. NET)_F_ for specific bit values. + +This is optional. + +.BI mac " MACADDR" +- specifies the mac address for the new vdpa device. +This is applicable only for the network type of vdpa device. This is optional. + +.BI mtu " MTU" +- specifies the mtu for the new vdpa device. +This is applicable only for the network type of vdpa device. This is optional.
+ +.SS vdpa dev del - Delete the vdpa device. + +.PP +.I "DEV" +- specifies the vdpa device to delete. + +.SS vdpa dev config show - Show configuration of specific device or all devices. + +.PP +.I "DEV" +- specifies the vdpa device to show its configuration. +If this argument is omitted the configuration of all devices is listed. + +.in +4 +Format is: +.in +2 +VDPA_DEVICE_NAME + +.SS vdpa dev vstats show - shows vendor specific statistics for the given device and virtqueue index. The information is presented as name-value pairs where name is the name of the field and value is a numeric value for it. + +.TP +.BI "DEV" +- specifies the vdpa device to query + +.TP +.BI qidx " QUEUE_INDEX" +- specifies the virtqueue index to query + +.SH "EXAMPLES" +.PP +vdpa dev show +.RS 4 +Shows all the vdpa devices on the system. +.RE +.PP +vdpa dev show foo +.RS 4 +Shows the specified vdpa device. +.RE +.PP +vdpa dev add name foo mgmtdev vdpa_sim_net +.RS 4 +Add the vdpa device named foo on the management device vdpa_sim_net. +.RE +.PP +vdpa dev add name foo mgmtdev vdpa_sim_net device_features 0x300020000 +.RS 4 +Add the vdpa device named foo on the management device vdpa_sim_net with device_features of 0x300020000 +.RE +.PP +vdpa dev add name foo mgmtdev vdpa_sim_net mac 00:11:22:33:44:55 +.RS 4 +Add the vdpa device named foo on the management device vdpa_sim_net with mac address of 00:11:22:33:44:55. +.RE +.PP +vdpa dev add name foo mgmtdev vdpa_sim_net mac 00:11:22:33:44:55 mtu 9000 +.RS 4 +Add the vdpa device named foo on the management device vdpa_sim_net with mac address of 00:11:22:33:44:55 and mtu of 9000 bytes. +.RE +.PP +vdpa dev add name foo mgmtdev auxiliary/mlx5_core.sf.1 mac 00:11:22:33:44:55 max_vqp 8 +.RS 4 +Add the vdpa device named foo on the management device auxiliary/mlx5_core.sf.1 with mac address of 00:11:22:33:44:55 and max 8 virtqueue pairs +.RE +.PP +vdpa dev del foo +.RS 4 +Delete the vdpa device named foo which was previously created.
+.RE +.PP +vdpa dev config show foo +.RS 4 +Shows the vdpa device configuration of device named foo. +.RE +.PP +vdpa dev vstats show vdpa0 qidx 1 +.RS 4 +Shows vendor specific statistics information for vdpa device vdpa0 and virtqueue index 1. +.RE + +.SH SEE ALSO +.BR vdpa (8), +.BR vdpa-mgmtdev (8) +.br + +.SH AUTHOR +Parav Pandit <parav@nvidia.com> diff --git a/man/man8/vdpa-mgmtdev.8 b/man/man8/vdpa-mgmtdev.8 new file mode 100644 index 0000000..cae2cbd --- /dev/null +++ b/man/man8/vdpa-mgmtdev.8 @@ -0,0 +1,53 @@ +.TH VDPA\-MGMTDEV 8 "5 Jan 2021" "iproute2" "Linux" +.SH NAME +vdpa-mgmtdev \- vdpa management device view +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B vdpa +.B mgmtdev +.RI " { " COMMAND | " " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] +} + +.ti -8 +.B vdpa mgmtdev show +.RI "[ " MGMTDEV " ]" + +.ti -8 +.B vdpa mgmtdev help + +.SH "DESCRIPTION" +.SS vdpa mgmtdev show - display vdpa management device attributes + +.PP +.I "MGMTDEV" +- specifies the vdpa management device to show. +If this argument is omitted all management devices are listed. + +.SH "EXAMPLES" +.PP +vdpa mgmtdev show +.RS 4 +Shows all the vdpa management devices on the system. +.RE +.PP +vdpa mgmtdev show bar +.RS 4 +Shows the specified vdpa management device. +.RE + +.SH SEE ALSO +.BR vdpa (8), +.BR vdpa-dev (8) +.br + +.SH AUTHOR +Parav Pandit <parav@nvidia.com> diff --git a/man/man8/vdpa.8 b/man/man8/vdpa.8 new file mode 100644 index 0000000..d1aaece --- /dev/null +++ b/man/man8/vdpa.8 @@ -0,0 +1,76 @@ +.TH VDPA 8 "5 Jan 2021" "iproute2" "Linux" +.SH NAME +vdpa \- vdpa management tool +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B vdpa +.RI "[ " OPTIONS " ] { " dev | mgmtdev " } { " COMMAND " | " +.BR help " }" +.sp + +.SH OPTIONS + +.TP +.BR "\-V" , " --Version" +Print the version of the +.B vdpa +utility and exit. + +.TP +.BR "\-j" , " --json" +Generate JSON output. + +.TP +.BR "\-p" , " --pretty" +When combined with -j generate a pretty JSON output.
+ +.SS +.I OBJECT + +.TP +.B dev +- vdpa device. + +.TP +.B mgmtdev +- vdpa management device. + +.SS +.I COMMAND + +Specifies the action to perform on the object. +The set of possible actions depends on the object type. +It is possible to +.B show +(or +.B list +) objects. The +.B help +command is available for all objects. It prints +out a list of available commands and argument syntax conventions. +.sp +If no command is given, some default command is assumed. +Usually it is +.B show +or, if the objects of this class cannot be listed, +.BR "help" . + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR vdpa-dev (8), +.BR vdpa-mgmtdev (8), +.br + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Parav Pandit <parav@nvidia.com> |