diff options
Diffstat (limited to 'sys-utils')
101 files changed, 39192 insertions, 0 deletions
diff --git a/sys-utils/Makemodule.am b/sys-utils/Makemodule.am new file mode 100644 index 0000000..825a733 --- /dev/null +++ b/sys-utils/Makemodule.am @@ -0,0 +1,470 @@ +if BUILD_LSMEM +usrbin_exec_PROGRAMS += lsmem +dist_man_MANS += sys-utils/lsmem.1 +lsmem_SOURCES = sys-utils/lsmem.c +lsmem_LDADD = $(LDADD) libcommon.la libsmartcols.la +lsmem_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +endif + +if BUILD_CHMEM +usrbin_exec_PROGRAMS += chmem +dist_man_MANS += sys-utils/chmem.8 +chmem_SOURCES = sys-utils/chmem.c +chmem_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_FLOCK +usrbin_exec_PROGRAMS += flock +dist_man_MANS += sys-utils/flock.1 +flock_SOURCES = sys-utils/flock.c lib/monotonic.c lib/timer.c +flock_LDADD = $(LDADD) libcommon.la $(REALTIME_LIBS) +endif + +if BUILD_CHOOM +usrbin_exec_PROGRAMS += choom +dist_man_MANS += sys-utils/choom.1 +choom_SOURCES = sys-utils/choom.c +choom_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_IPCMK +usrbin_exec_PROGRAMS += ipcmk +dist_man_MANS += sys-utils/ipcmk.1 +ipcmk_SOURCES = sys-utils/ipcmk.c +ipcmk_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_IPCRM +usrbin_exec_PROGRAMS += ipcrm +dist_man_MANS += sys-utils/ipcrm.1 +ipcrm_SOURCES = sys-utils/ipcrm.c +ipcrm_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_IPCS +usrbin_exec_PROGRAMS += ipcs +dist_man_MANS += sys-utils/ipcs.1 +ipcs_SOURCES = sys-utils/ipcs.c \ + sys-utils/ipcutils.c \ + sys-utils/ipcutils.h +ipcs_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_LSIPC +usrbin_exec_PROGRAMS += lsipc +dist_man_MANS += sys-utils/lsipc.1 +lsipc_SOURCES = sys-utils/lsipc.c \ + sys-utils/ipcutils.c \ + sys-utils/ipcutils.h +lsipc_LDADD = $(LDADD) libcommon.la libsmartcols.la +lsipc_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +endif + +if BUILD_RENICE +usrbin_exec_PROGRAMS += renice +dist_man_MANS += sys-utils/renice.1 +renice_SOURCES = sys-utils/renice.c +endif + +if BUILD_RFKILL +usrsbin_exec_PROGRAMS += rfkill +dist_man_MANS += sys-utils/rfkill.8 +rfkill_SOURCES = sys-utils/rfkill.c +rfkill_LDADD = $(LDADD) libcommon.la libsmartcols.la +rfkill_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +endif + +if BUILD_SETSID +usrbin_exec_PROGRAMS += setsid +dist_man_MANS += sys-utils/setsid.1 +setsid_SOURCES = sys-utils/setsid.c +endif + +if BUILD_READPROFILE +usrsbin_exec_PROGRAMS += readprofile +dist_man_MANS += sys-utils/readprofile.8 +readprofile_SOURCES = sys-utils/readprofile.c +endif + +if BUILD_TUNELP +usrsbin_exec_PROGRAMS += tunelp +dist_man_MANS += sys-utils/tunelp.8 +tunelp_SOURCES = sys-utils/tunelp.c +tunelp_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_FSTRIM +sbin_PROGRAMS += fstrim +dist_man_MANS += sys-utils/fstrim.8 +fstrim_SOURCES = sys-utils/fstrim.c +fstrim_LDADD = $(LDADD) libcommon.la libmount.la +fstrim_CFLAGS = $(AM_CFLAGS) -I$(ul_libmount_incdir) +if HAVE_SYSTEMD +systemdsystemunit_DATA += \ + sys-utils/fstrim.service \ + sys-utils/fstrim.timer +endif +endif # BUILD_FSTRIM + +PATHFILES += sys-utils/fstrim.service +EXTRA_DIST += sys-utils/fstrim.timer + +if BUILD_DMESG +bin_PROGRAMS += dmesg +dist_man_MANS += sys-utils/dmesg.1 +dmesg_SOURCES = sys-utils/dmesg.c lib/monotonic.c +dmesg_LDADD = $(LDADD) libcommon.la libtcolors.la $(REALTIME_LIBS) +dmesg_CFLAGS = $(AM_CFLAGS) +check_PROGRAMS += test_dmesg +test_dmesg_SOURCES = $(dmesg_SOURCES) +test_dmesg_LDADD = $(dmesg_LDADD) +test_dmesg_CFLAGS = -DTEST_DMESG $(dmesg_CFLAGS) +endif + +if BUILD_CTRLALTDEL +sbin_PROGRAMS += ctrlaltdel +dist_man_MANS += sys-utils/ctrlaltdel.8 +ctrlaltdel_SOURCES = sys-utils/ctrlaltdel.c +ctrlaltdel_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_FSFREEZE +sbin_PROGRAMS += fsfreeze +dist_man_MANS += sys-utils/fsfreeze.8 +fsfreeze_SOURCES = sys-utils/fsfreeze.c +endif + +if BUILD_BLKDISCARD +sbin_PROGRAMS += blkdiscard +dist_man_MANS += sys-utils/blkdiscard.8 +blkdiscard_SOURCES = sys-utils/blkdiscard.c lib/monotonic.c +blkdiscard_LDADD = $(LDADD) libcommon.la $(REALTIME_LIBS) +endif + +if BUILD_BLKZONE +sbin_PROGRAMS += blkzone +dist_man_MANS += sys-utils/blkzone.8 +blkzone_SOURCES = sys-utils/blkzone.c +blkzone_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_LDATTACH +usrsbin_exec_PROGRAMS += ldattach +dist_man_MANS += sys-utils/ldattach.8 +ldattach_SOURCES = sys-utils/ldattach.c +ldattach_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_RTCWAKE +usrsbin_exec_PROGRAMS += rtcwake +dist_man_MANS += sys-utils/rtcwake.8 +PATHFILES += sys-utils/rtcwake.8 +rtcwake_SOURCES = sys-utils/rtcwake.c +rtcwake_LDADD = $(LDADD) libcommon.la +endif + +if BUILD_SETARCH +usrbin_exec_PROGRAMS += setarch +dist_man_MANS += sys-utils/setarch.8 +setarch_SOURCES = sys-utils/setarch.c + +SETARCH_LINKS = uname26 linux32 linux64 + +if ARCH_S390 +SETARCH_LINKS += s390 s390x +endif +if ARCH_I86 +SETARCH_LINKS += i386 +endif +if ARCH_86_64 +SETARCH_LINKS += i386 x86_64 +endif +if ARCH_PPC +SETARCH_LINKS += ppc ppc64 ppc32 +endif +if ARCH_SPARC +SETARCH_LINKS += sparc sparc64 sparc32 sparc32bash +endif +if ARCH_MIPS +SETARCH_LINKS += mips mips64 mips32 +endif +if ARCH_IA64 +SETARCH_LINKS += i386 ia64 +endif +if ARCH_HPPA +SETARCH_LINKS += parisc parisc64 parisc32 +endif + +SETARCH_MAN_LINKS = $(addprefix sys-utils/,$(SETARCH_LINKS:=.8)) +man_MANS += $(SETARCH_MAN_LINKS) +CLEANFILES += $(SETARCH_MAN_LINKS) + +$(SETARCH_MAN_LINKS): + $(AM_V_at) $(MKDIR_P) sys-utils + $(AM_V_GEN)echo ".so man8/setarch.8" > $@ + +install-exec-hook-setarch: + for I in $(SETARCH_LINKS); do \ + cd $(DESTDIR)$(usrbin_execdir) && ln -sf setarch $$I ; \ + done + +uninstall-hook-setarch: + for I in $(SETARCH_LINKS); do \ + rm -f $(DESTDIR)$(usrbin_execdir)/$$I ; \ + done + +INSTALL_EXEC_HOOKS += install-exec-hook-setarch +UNINSTALL_HOOKS += uninstall-hook-setarch + +endif # BUILD_SETARCH + + +if BUILD_EJECT +usrbin_exec_PROGRAMS += eject +eject_SOURCES = sys-utils/eject.c lib/monotonic.c +eject_LDADD = $(LDADD) libmount.la libcommon.la $(REALTIME_LIBS) +eject_CFLAGS = $(AM_CFLAGS) -I$(ul_libmount_incdir) +dist_man_MANS += sys-utils/eject.1 +endif + + +if BUILD_LOSETUP +sbin_PROGRAMS += losetup +dist_man_MANS += sys-utils/losetup.8 +losetup_SOURCES = sys-utils/losetup.c +losetup_LDADD = $(LDADD) libcommon.la libsmartcols.la +losetup_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) + +if HAVE_STATIC_LOSETUP +bin_PROGRAMS += losetup.static +losetup_static_SOURCES = $(losetup_SOURCES) +losetup_static_LDFLAGS = -all-static +losetup_static_LDADD = $(losetup_LDADD) +losetup_static_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +endif +endif # BUILD_LOSETUP + + +if BUILD_ZRAMCTL +sbin_PROGRAMS += zramctl +dist_man_MANS += sys-utils/zramctl.8 +zramctl_SOURCES = sys-utils/zramctl.c +zramctl_LDADD = $(LDADD) libcommon.la libsmartcols.la +zramctl_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +endif + + +if BUILD_PRLIMIT +usrbin_exec_PROGRAMS += prlimit +dist_man_MANS += sys-utils/prlimit.1 +prlimit_SOURCES = sys-utils/prlimit.c +prlimit_LDADD = $(LDADD) libcommon.la libsmartcols.la +prlimit_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +endif + + +if BUILD_LSNS +usrbin_exec_PROGRAMS += lsns +dist_man_MANS += sys-utils/lsns.8 +lsns_SOURCES = sys-utils/lsns.c +lsns_LDADD = $(LDADD) libcommon.la libsmartcols.la libmount.la +lsns_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) -I$(ul_libmount_incdir) +endif + + +if BUILD_MOUNT +bin_PROGRAMS += mount umount +dist_man_MANS += \ + sys-utils/mount.8 \ + sys-utils/fstab.5 \ + sys-utils/umount.8 +mount_SOURCES = sys-utils/mount.c +mount_LDADD = $(LDADD) libcommon.la libmount.la $(SELINUX_LIBS) +mount_CFLAGS = $(SUID_CFLAGS) $(AM_CFLAGS) -I$(ul_libmount_incdir) +mount_LDFLAGS = $(SUID_LDFLAGS) $(AM_LDFLAGS) + +umount_SOURCES = sys-utils/umount.c +umount_LDADD = $(LDADD) libcommon.la libmount.la +umount_CFLAGS = $(AM_CFLAGS) $(SUID_CFLAGS) -I$(ul_libmount_incdir) +umount_LDFLAGS = $(SUID_LDFLAGS) $(AM_LDFLAGS) + +if HAVE_STATIC_MOUNT +bin_PROGRAMS += mount.static +mount_static_SOURCES = $(mount_SOURCES) +mount_static_CFLAGS = $(mount_CFLAGS) +mount_static_LDFLAGS = $(mount_LDFLAGS) -all-static +mount_static_LDADD = $(mount_LDADD) $(SELINUX_LIBS_STATIC) +endif + +if HAVE_STATIC_UMOUNT +bin_PROGRAMS += umount.static +umount_static_SOURCES = $(umount_SOURCES) +umount_static_CFLAGS = $(umount_CFLAGS) +umount_static_LDFLAGS = $(umount_LDFLAGS) -all-static +umount_static_LDADD = $(umount_LDADD) +endif + +install-exec-hook-mount: +if MAKEINSTALL_DO_CHOWN + chown root:root $(DESTDIR)$(bindir)/mount +endif +if MAKEINSTALL_DO_SETUID + chmod 4755 $(DESTDIR)$(bindir)/mount +endif +if MAKEINSTALL_DO_CHOWN + chown root:root $(DESTDIR)$(bindir)/umount +endif +if MAKEINSTALL_DO_SETUID + chmod 4755 $(DESTDIR)$(bindir)/umount +endif + +INSTALL_EXEC_HOOKS += install-exec-hook-mount +endif # BUILD_MOUNT + + +if BUILD_SWAPON +sbin_PROGRAMS += swapon swapoff +dist_man_MANS += \ + sys-utils/swapoff.8 \ + sys-utils/swapon.8 + +swapon_SOURCES = \ + sys-utils/swapon.c \ + sys-utils/swapon-common.c \ + sys-utils/swapon-common.h \ + lib/swapprober.c \ + include/swapprober.h +swapon_CFLAGS = $(AM_CFLAGS) \ + -I$(ul_libblkid_incdir) \ + -I$(ul_libmount_incdir) \ + -I$(ul_libsmartcols_incdir) +swapon_LDADD = $(LDADD) \ + libblkid.la \ + libcommon.la \ + libmount.la \ + libsmartcols.la + +swapoff_SOURCES = \ + sys-utils/swapoff.c \ + sys-utils/swapon-common.c \ + sys-utils/swapon-common.h \ + lib/swapprober.c \ + include/swapprober.h +swapoff_CFLAGS = $(AM_CFLAGS) \ + -I$(ul_libblkid_incdir) \ + -I$(ul_libmount_incdir) +swapoff_LDADD = $(LDADD) \ + libmount.la \ + libblkid.la \ + libcommon.la +endif + +if BUILD_LSCPU +usrbin_exec_PROGRAMS += lscpu +lscpu_SOURCES = \ + sys-utils/lscpu.c \ + sys-utils/lscpu.h \ + sys-utils/lscpu-arm.c \ + sys-utils/lscpu-dmi.c +lscpu_LDADD = $(LDADD) libcommon.la libsmartcols.la $(RTAS_LIBS) +lscpu_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +dist_man_MANS += sys-utils/lscpu.1 +endif + +if BUILD_CHCPU +sbin_PROGRAMS += chcpu +chcpu_SOURCES = sys-utils/chcpu.c +chcpu_LDADD = $(LDADD) libcommon.la +dist_man_MANS += sys-utils/chcpu.8 +endif + +if BUILD_WDCTL +bin_PROGRAMS += wdctl +dist_man_MANS += sys-utils/wdctl.8 +wdctl_SOURCES = sys-utils/wdctl.c +wdctl_LDADD = $(LDADD) libcommon.la libsmartcols.la +wdctl_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir) +endif + +if BUILD_MOUNTPOINT +bin_PROGRAMS += mountpoint +mountpoint_LDADD = $(LDADD) libmount.la +mountpoint_CFLAGS = $(AM_CFLAGS) -I$(ul_libmount_incdir) +dist_man_MANS += sys-utils/mountpoint.1 +mountpoint_SOURCES = sys-utils/mountpoint.c +endif + +if BUILD_FALLOCATE +usrbin_exec_PROGRAMS += fallocate +fallocate_SOURCES = sys-utils/fallocate.c +fallocate_LDADD = $(LDADD) libcommon.la +dist_man_MANS += sys-utils/fallocate.1 +endif + +if BUILD_PIVOT_ROOT +sbin_PROGRAMS += pivot_root +dist_man_MANS += sys-utils/pivot_root.8 +pivot_root_SOURCES = sys-utils/pivot_root.c +endif + +if BUILD_SWITCH_ROOT +sbin_PROGRAMS += switch_root +dist_man_MANS += sys-utils/switch_root.8 +switch_root_SOURCES = sys-utils/switch_root.c +endif + +if BUILD_UNSHARE +usrbin_exec_PROGRAMS += unshare +dist_man_MANS += sys-utils/unshare.1 +unshare_SOURCES = sys-utils/unshare.c +unshare_LDADD = $(LDADD) libcommon.la +unshare_CFLAGS = $(AM_CFLAGS) -I$(ul_libmount_incdir) + +if HAVE_STATIC_UNSHARE +usrbin_exec_PROGRAMS += unshare.static +unshare_static_SOURCES = $(unshare_SOURCES) +unshare_static_LDFLAGS = -all-static +unshare_static_LDADD = $(unshare_LDADD) +unshare_static_CFLAGS = $(unshare_CFLAGS) +endif +endif + +if BUILD_NSENTER +usrbin_exec_PROGRAMS += nsenter +dist_man_MANS += sys-utils/nsenter.1 +nsenter_SOURCES = sys-utils/nsenter.c +nsenter_LDADD = $(LDADD) libcommon.la $(SELINUX_LIBS) + +if HAVE_STATIC_NSENTER +usrbin_exec_PROGRAMS += nsenter.static +nsenter_static_SOURCES = $(nsenter_SOURCES) +nsenter_static_LDFLAGS = -all-static +nsenter_static_LDADD = $(nsenter_LDADD) +endif +endif + +if BUILD_HWCLOCK +sbin_PROGRAMS += hwclock +dist_man_MANS += \ + sys-utils/hwclock.8 \ + sys-utils/adjtime_config.5 +PATHFILES += sys-utils/hwclock.8 +hwclock_SOURCES = \ + sys-utils/hwclock.c \ + sys-utils/hwclock.h \ + sys-utils/hwclock-cmos.c +if LINUX +hwclock_SOURCES += sys-utils/hwclock-rtc.c +endif +hwclock_LDADD = $(LDADD) libcommon.la -lm +if HAVE_AUDIT +hwclock_LDADD += -laudit +endif +endif # BUILD_HWCLOCK + +if BUILD_SETPRIV +usrbin_exec_PROGRAMS += setpriv +dist_man_MANS += sys-utils/setpriv.1 +setpriv_SOURCES = sys-utils/setpriv.c +setpriv_LDADD = $(LDADD) -lcap-ng libcommon.la +endif diff --git a/sys-utils/adjtime_config.5 b/sys-utils/adjtime_config.5 new file mode 100644 index 0000000..6f03ca7 --- /dev/null +++ b/sys-utils/adjtime_config.5 @@ -0,0 +1,64 @@ +.TH ADJTIME_CONFIG 5 "August 2018" "util-linux" "File Formats" +.SH NAME +adjtime \- information about hardware clock setting and drift factor +.SH SYNOPSIS +.I /etc/adjtime +.SH DESCRIPTION +The file +.B /etc/adjtime +contains descriptive information about the hardware mode clock setting and clock drift factor. +The file is read and write by hwclock; and read by programs like rtcwake to get RTC time mode. +.PP +The file is usually located in /etc, but tools like +.BR hwclock (8) +or +.BR rtcwake (8) +allow to use alternative location by command line options if write access to +/etc is unwanted. The default clock mode is "UTC" if the file is missing. +.PP +The Hardware Clock is usually not very accurate. However, much of its inaccuracy is completely predictable - it gains +or loses the same amount of time every day. This is called systematic drift. The util hwclock keeps the file /etc/adjtime, +that keeps some historical information. +For more details see "\fBThe Adjust Function\fR" and "\fBThe Adjtime File\fR" sections from +.BR hwckock (8) +man page. +.PP + +The format of the adjtime file is, in ASCII. +.sp +.SS First line +Three numbers, separated by blanks: +.TP +.B "drift factor" +the systematic drift rate in seconds per day (floating point decimal) +.TP +.B last adjust time +the resulting number of seconds since 1969 UTC of most recent adjustment or calibration (decimal integer) +.TP +.B "adjustment status" +zero (for compatibility with clock(8)) as a decimal integer. + +.SS Second line +.TP +.B "last calibration time" +The resulting number of seconds since 1969 UTC of most recent calibration. +Zero if there has been no calibration yet or it is known that any previous +calibration is moot (for example, because the Hardware Clock has been found, +since that calibration, not to contain a valid time). This is a decimal +integer. + +.SS Third line +.TP +.B "clock mode" +Supported values are "UTC" or "LOCAL". Tells whether the Hardware Clock is set +to Coordinated Universal Time or local time. You can always override this +value with options on the hwclock command line. + +.SH FILES +.IR /etc/adjtime +.SH "SEE ALSO" +.BR hwclock (8), +.BR rtcwake (8) +.SH AVAILABILITY +This man page is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/blkdiscard.8 b/sys-utils/blkdiscard.8 new file mode 100644 index 0000000..1f3a32b --- /dev/null +++ b/sys-utils/blkdiscard.8 @@ -0,0 +1,85 @@ +.TH BLKDISCARD 8 "July 2014" "util-linux" "System Administration" +.SH NAME +blkdiscard \- discard sectors on a device +.SH SYNOPSIS +.B blkdiscard +[options] +.RB [ \-o +.IR offset ] +.RB [ \-l +.IR length ] +.I device +.SH DESCRIPTION +.B blkdiscard +is used to discard device sectors. This is useful for solid-state +drivers (SSDs) and thinly-provisioned storage. Unlike +.BR fstrim (8), +this command is used directly on the block device. +.PP +By default, +.B blkdiscard +will discard all blocks on the device. Options may be used to modify +this behavior based on range or size, as explained below. +.PP +The +.I device +argument is the pathname of the block device. +.PP +.B WARNING: All data in the discarded region on the device will be lost! +.SH OPTIONS +The +.I offset +and +.I length +arguments may be followed by the multiplicative suffixes KiB (=1024), +MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB and YiB (the "iB" is +optional, e.g., "K" has the same meaning as "KiB") or the suffixes +KB (=1000), MB (=1000*1000), and so on for GB, TB, PB, EB, ZB and YB. +.TP +.BR \-o , " \-\-offset \fIoffset" +Byte offset into the device from which to start discarding. The provided value +will be aligned to the device sector size. The default value is zero. +.TP +.BR \-l , " \-\-length \fIlength" +The number of bytes to discard (counting from the starting point). The provided value +will be aligned to the device sector size. If the specified value extends past +the end of the device, +.B blkdiscard +will stop at the device size boundary. The default value extends to the end +of the device. +.TP +.BR \-p , " \-\-step \fIlength" +The number of bytes to discard within one iteration. The default is to discard +all by one ioctl call. +.TP +.BR \-s , " \-\-secure" +Perform a secure discard. A secure discard is the same as a regular discard +except that all copies of the discarded blocks that were possibly created by +garbage collection must also be erased. This requires support from the device. +.TP +.BR \-z , " \-\-zeroout" +Zero-fill rather than discard. +.TP +.BR \-v , " \-\-verbose" +Display the aligned values of +.I offset +and +.IR length . +If the \fB\-\-step\fR option is specified, it prints the discard progress every second. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH AUTHOR +.MT lczerner@redhat.com +Lukas Czerner +.ME +.SH SEE ALSO +.BR fstrim (8) +.SH AVAILABILITY +The blkdiscard command is part of the util-linux package and is available +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/blkdiscard.c b/sys-utils/blkdiscard.c new file mode 100644 index 0000000..c19b67b --- /dev/null +++ b/sys-utils/blkdiscard.c @@ -0,0 +1,254 @@ +/* + * blkdiscard.c -- discard the part (or whole) of the block device. + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Written by Lukas Czerner <lczerner@redhat.com> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * This program uses BLKDISCARD ioctl to discard part or the whole block + * device if the device supports it. You can specify range (start and + * length) to be discarded, or simply discard the whole device. + */ + + +#include <string.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <fcntl.h> +#include <limits.h> +#include <getopt.h> +#include <time.h> + +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <linux/fs.h> + +#include "nls.h" +#include "strutils.h" +#include "c.h" +#include "closestream.h" +#include "monotonic.h" + +#ifndef BLKDISCARD +# define BLKDISCARD _IO(0x12,119) +#endif + +#ifndef BLKSECDISCARD +# define BLKSECDISCARD _IO(0x12,125) +#endif + +#ifndef BLKZEROOUT +# define BLKZEROOUT _IO(0x12,127) +#endif + +enum { + ACT_DISCARD = 0, /* default */ + ACT_ZEROOUT, + ACT_SECURE +}; + +static void print_stats(int act, char *path, uint64_t stats[]) +{ + switch (act) { + case ACT_ZEROOUT: + printf(_("%s: Zero-filled %" PRIu64 " bytes from the offset %" PRIu64"\n"), \ + path, stats[1], stats[0]); + break; + case ACT_SECURE: + case ACT_DISCARD: + printf(_("%s: Discarded %" PRIu64 " bytes from the offset %" PRIu64"\n"), \ + path, stats[1], stats[0]); + break; + } +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %s [options] <device>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Discard the content of sectors on a device.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -o, --offset <num> offset in bytes to discard from\n"), out); + fputs(_(" -l, --length <num> length of bytes to discard from the offset\n"), out); + fputs(_(" -p, --step <num> size of the discard iterations within the offset\n"), out); + fputs(_(" -s, --secure perform secure discard\n"), out); + fputs(_(" -z, --zeroout zero-fill rather than discard\n"), out); + fputs(_(" -v, --verbose print aligned length and offset\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(21)); + + printf(USAGE_MAN_TAIL("blkdiscard(8)")); + exit(EXIT_SUCCESS); +} + + +int main(int argc, char **argv) +{ + char *path; + int c, fd, verbose = 0, secsize; + uint64_t end, blksize, step, range[2], stats[2]; + struct stat sb; + struct timeval now, last; + int act = ACT_DISCARD; + + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "offset", required_argument, NULL, 'o' }, + { "length", required_argument, NULL, 'l' }, + { "step", required_argument, NULL, 'p' }, + { "secure", no_argument, NULL, 's' }, + { "verbose", no_argument, NULL, 'v' }, + { "zeroout", no_argument, NULL, 'z' }, + { NULL, 0, NULL, 0 } + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + range[0] = 0; + range[1] = ULLONG_MAX; + step = 0; + + while ((c = getopt_long(argc, argv, "hVsvo:l:p:z", longopts, NULL)) != -1) { + switch(c) { + case 'h': + usage(); + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'l': + range[1] = strtosize_or_err(optarg, + _("failed to parse length")); + break; + case 'o': + range[0] = strtosize_or_err(optarg, + _("failed to parse offset")); + break; + case 'p': + step = strtosize_or_err(optarg, + _("failed to parse step")); + break; + case 's': + act = ACT_SECURE; + break; + case 'v': + verbose = 1; + break; + case 'z': + act = ACT_ZEROOUT; + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (optind == argc) + errx(EXIT_FAILURE, _("no device specified")); + + path = argv[optind++]; + + if (optind != argc) { + warnx(_("unexpected number of arguments")); + errtryhelp(EXIT_FAILURE); + } + + fd = open(path, O_WRONLY); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), path); + + if (fstat(fd, &sb) == -1) + err(EXIT_FAILURE, _("stat of %s failed"), path); + if (!S_ISBLK(sb.st_mode)) + errx(EXIT_FAILURE, _("%s: not a block device"), path); + + if (ioctl(fd, BLKGETSIZE64, &blksize)) + err(EXIT_FAILURE, _("%s: BLKGETSIZE64 ioctl failed"), path); + if (ioctl(fd, BLKSSZGET, &secsize)) + err(EXIT_FAILURE, _("%s: BLKSSZGET ioctl failed"), path); + + /* check offset alignment to the sector size */ + if (range[0] % secsize) + errx(EXIT_FAILURE, _("%s: offset %" PRIu64 " is not aligned " + "to sector size %i"), path, range[0], secsize); + + /* is the range end behind the end of the device ?*/ + if (range[0] > blksize) + errx(EXIT_FAILURE, _("%s: offset is greater than device size"), path); + end = range[0] + range[1]; + if (end < range[0] || end > blksize) + end = blksize; + + range[1] = (step > 0) ? step : end - range[0]; + + /* check length alignment to the sector size */ + if (range[1] % secsize) + errx(EXIT_FAILURE, _("%s: length %" PRIu64 " is not aligned " + "to sector size %i"), path, range[1], secsize); + + stats[0] = range[0], stats[1] = 0; + gettime_monotonic(&last); + + for (/* nothing */; range[0] < end; range[0] += range[1]) { + if (range[0] + range[1] > end) + range[1] = end - range[0]; + + switch (act) { + case ACT_ZEROOUT: + if (ioctl(fd, BLKZEROOUT, &range)) + err(EXIT_FAILURE, _("%s: BLKZEROOUT ioctl failed"), path); + break; + case ACT_SECURE: + if (ioctl(fd, BLKSECDISCARD, &range)) + err(EXIT_FAILURE, _("%s: BLKSECDISCARD ioctl failed"), path); + break; + case ACT_DISCARD: + if (ioctl(fd, BLKDISCARD, &range)) + err(EXIT_FAILURE, _("%s: BLKDISCARD ioctl failed"), path); + break; + } + + stats[1] += range[1]; + + /* reporting progress at most once per second */ + if (verbose && step) { + gettime_monotonic(&now); + if (now.tv_sec > last.tv_sec && + (now.tv_usec >= last.tv_usec || now.tv_sec > last.tv_sec + 1)) { + print_stats(act, path, stats); + stats[0] += stats[1], stats[1] = 0; + last = now; + } + } + } + + if (verbose && stats[1]) + print_stats(act, path, stats); + + close(fd); + return EXIT_SUCCESS; +} diff --git a/sys-utils/blkzone.8 b/sys-utils/blkzone.8 new file mode 100644 index 0000000..bf7f15f --- /dev/null +++ b/sys-utils/blkzone.8 @@ -0,0 +1,109 @@ +.TH BLKZONE 8 "February 2017" "util-linux" "System Administration" +.SH NAME +blkzone \- run zone command on a device +.SH SYNOPSIS +.B blkzone +.I command +[options] +.I device +.SH DESCRIPTION +.B blkzone +is used to run zone command on device that support the Zoned Block Commands +(ZBC) or Zoned-device ATA Commands (ZAC). The zones to operate on can be +specified using the offset, count and length options. +.PP +The +.I device +argument is the pathname of the block device. +.SH COMMANDS +.SS report +The command \fBblkzone report\fP is used to report device zone information. +.PP +By default, the command will report all zones from the start of the +block device. Options may be used to modify this behavior, changing the +starting zone or the size of the report, as explained below. + +.B Report output +.TS +tab(:); +l l. +start:Zone start sector +len:Zone length in number of sectors +wptr:Zone write pointer position +reset:Reset write pointer recommended +non-seq:Non-sequential write resources active +cond:Zone condition +type:Zone type +.TE + +.B Zone conditions +.TS +tab(:); +l l. +cl:Closed +nw:Not write pointer +em:Empty +fu:Full +oe:Explicitly opened +oi:Implicitly opened +ol:Offline +ro:Read only +x?:Reserved conditions (should not be reported) +.TE + +.SS reset +The command \fBblkzone reset\fP is used to reset one or more zones. Unlike +.BR sg_reset_wp (8), +this command operates from the block layer and can reset a range of zones. +.PP +By default, the command will operate from the zone at device +sector 0 and reset all zones. Options may be used to modify this behavior +as well as specify the operation to be performed on the zone, as explained below. + +.SH OPTIONS +The +.I offset +and +.I length +option arguments may be followed by the multiplicative suffixes KiB (=1024), +MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB and YiB (the "iB" is +optional, e.g., "K" has the same meaning as "KiB") or the suffixes +KB (=1000), MB (=1000*1000), and so on for GB, TB, PB, EB, ZB and YB. +Additionally, the 0x prefix can be used to specify \fIoffset\fR and +\fIlength\fR in hex. +.TP +.BR \-o , " \-\-offset "\fIsector\fP +The starting zone specified as a sector offset. The provided offset in sector +units (512 bytes) should match the start of a zone. The default value is zero. +.TP +.BR \-l , " \-\-length "\fIsectors\fP +The maximum number of sectors the command should operate on. The default value +is the number of sectors remaining after \fIoffset\fR. This option cannot be +used together with the option \fB\-\-count\fP. +.TP +.BR \-c , " \-\-count "\fIcount\fP +The maximum number of zones the command should operate on. The default value +is the number of zones starting from \fIoffset\fR. This option cannot be +used together with the option \fB\-\-length\fP. +.TP +.BR \-v , " \-\-verbose" +Display the number of zones returned in the report or the range of sectors +reset.. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH AUTHORS +.nf +Shaun Tancheff <shaun@tancheff.com> +Karel Zak <kzak@redhat.com> +.fi +.SH SEE ALSO +.BR sg_rep_zones (8) +.SH AVAILABILITY +The blkzone command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/blkzone.c b/sys-utils/blkzone.c new file mode 100644 index 0000000..1dcbdf5 --- /dev/null +++ b/sys-utils/blkzone.c @@ -0,0 +1,416 @@ +/* + * blkzone.c -- the block device zone commands + * + * Copyright (C) 2015,2016 Seagate Technology PLC + * Written by Shaun Tancheff <shaun.tancheff@seagate.com> + * + * Copyright (C) 2017 Karel Zak <kzak@redhat.com> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#include <string.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <fcntl.h> +#include <limits.h> +#include <getopt.h> +#include <time.h> + +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <linux/fs.h> +#include <linux/blkzoned.h> + +#include "nls.h" +#include "strutils.h" +#include "xalloc.h" +#include "c.h" +#include "closestream.h" +#include "blkdev.h" +#include "sysfs.h" +#include "optutils.h" + +struct blkzone_control; + +static int blkzone_report(struct blkzone_control *ctl); +static int blkzone_reset(struct blkzone_control *ctl); + +struct blkzone_command { + const char *name; + int (*handler)(struct blkzone_control *); + const char *help; +}; + +struct blkzone_control { + const char *devname; + const struct blkzone_command *command; + + uint64_t total_sectors; + int secsize; + + uint64_t offset; + uint64_t length; + uint32_t count; + + unsigned int verbose : 1; +}; + +static const struct blkzone_command commands[] = { + { "report", blkzone_report, N_("Report zone information about the given device") }, + { "reset", blkzone_reset, N_("Reset a range of zones.") } +}; + +static const struct blkzone_command *name_to_command(const char *name) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(commands[i].name, name) == 0) + return &commands[i]; + } + + return NULL; +} + +static int init_device(struct blkzone_control *ctl, int mode) +{ + struct stat sb; + int fd; + + fd = open(ctl->devname, mode); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), ctl->devname); + + if (fstat(fd, &sb) == -1) + err(EXIT_FAILURE, _("stat of %s failed"), ctl->devname); + if (!S_ISBLK(sb.st_mode)) + errx(EXIT_FAILURE, _("%s: not a block device"), ctl->devname); + + if (blkdev_get_sectors(fd, (unsigned long long *) &ctl->total_sectors)) + err(EXIT_FAILURE, _("%s: blkdev_get_sectors ioctl failed"), ctl->devname); + + if (blkdev_get_sector_size(fd, &ctl->secsize)) + err(EXIT_FAILURE, _("%s: BLKSSZGET ioctl failed"), ctl->devname); + + return fd; +} + +/* + * Get the device zone size indicated by chunk sectors). + */ +static unsigned long blkdev_chunk_sectors(const char *dname) +{ + struct path_cxt *pc = NULL; + dev_t devno = sysfs_devname_to_devno(dname); + dev_t disk; + uint64_t sz = 0; + int rc; + + /* + * Mapping /dev/sdXn -> /sys/block/sdX to read the chunk_size entry. + * This method masks off the partition specified by the minor device + * component. + */ + pc = ul_new_sysfs_path(devno, NULL, NULL); + if (!pc) + return 0; + + rc = sysfs_blkdev_get_wholedisk(pc, NULL, 0, &disk); + if (rc != 0) + goto done; + + /* if @pc is not while-disk device, switch to disk */ + if (devno != disk) { + rc = sysfs_blkdev_init_path(pc, disk, NULL); + if (rc != 0) + goto done; + } + + rc = ul_path_read_u64(pc, &sz, "queue/chunk_sectors"); +done: + ul_unref_path(pc); + return rc == 0 ? sz : 0; +} + +/* + * blkzone report + */ +#define DEF_REPORT_LEN (1U << 12) /* 4k zones per report (256k kzalloc) */ + +static const char *type_text[] = { + "RESERVED", + "CONVENTIONAL", + "SEQ_WRITE_REQUIRED", + "SEQ_WRITE_PREFERRED", +}; + +static const char *condition_str[] = { + "nw", /* Not write pointer */ + "em", /* Empty */ + "oi", /* Implicitly opened */ + "oe", /* Explicitly opened */ + "cl", /* Closed */ + "x5", "x6", "x7", "x8", "x9", "xA", "xB", "xC", /* xN: reserved */ + "ro", /* Read only */ + "fu", /* Full */ + "of" /* Offline */ +}; + +static int blkzone_report(struct blkzone_control *ctl) +{ + struct blk_zone_report *zi; + unsigned long zonesize; + uint32_t i, nr_zones; + int fd; + + fd = init_device(ctl, O_RDONLY); + + if (ctl->offset >= ctl->total_sectors) + errx(EXIT_FAILURE, + _("%s: offset is greater than or equal to device size"), ctl->devname); + + zonesize = blkdev_chunk_sectors(ctl->devname); + if (!zonesize) + errx(EXIT_FAILURE, _("%s: unable to determine zone size"), ctl->devname); + + if (ctl->count) + nr_zones = ctl->count; + else if (ctl->length) + nr_zones = (ctl->length + zonesize - 1) / zonesize; + else + nr_zones = 1 + (ctl->total_sectors - ctl->offset) / zonesize; + + zi = xmalloc(sizeof(struct blk_zone_report) + + (DEF_REPORT_LEN * sizeof(struct blk_zone))); + + while (nr_zones && ctl->offset < ctl->total_sectors) { + + zi->nr_zones = min(nr_zones, DEF_REPORT_LEN); + zi->sector = ctl->offset; + + if (ioctl(fd, BLKREPORTZONE, zi) == -1) + err(EXIT_FAILURE, _("%s: BLKREPORTZONE ioctl failed"), ctl->devname); + + if (ctl->verbose) + printf(_("Found %d zones from 0x%"PRIx64"\n"), + zi->nr_zones, ctl->offset); + + if (!zi->nr_zones) { + nr_zones = 0; + break; + } + + for (i = 0; i < zi->nr_zones; i++) { + const struct blk_zone *entry = &zi->zones[i]; + unsigned int type = entry->type; + uint64_t start = entry->start; + uint64_t wp = entry->wp; + uint8_t cond = entry->cond; + uint64_t len = entry->len; + + if (!len) { + nr_zones = 0; + break; + } + + printf(_(" start: 0x%09"PRIx64", len 0x%06"PRIx64", wptr 0x%06"PRIx64 + " reset:%u non-seq:%u, zcond:%2u(%s) [type: %u(%s)]\n"), + start, len, (type == 0x1) ? 0 : wp - start, + entry->reset, entry->non_seq, + cond, condition_str[cond & (ARRAY_SIZE(condition_str) - 1)], + type, type_text[type]); + + nr_zones--; + ctl->offset = start + len; + + } + + } + + free(zi); + close(fd); + + return 0; +} + +/* + * blkzone reset + */ +static int blkzone_reset(struct blkzone_control *ctl) +{ + struct blk_zone_range za = { .sector = 0 }; + unsigned long zonesize; + uint64_t zlen; + int fd; + + zonesize = blkdev_chunk_sectors(ctl->devname); + if (!zonesize) + errx(EXIT_FAILURE, _("%s: unable to determine zone size"), ctl->devname); + + fd = init_device(ctl, O_WRONLY); + + if (ctl->offset & (zonesize - 1)) + errx(EXIT_FAILURE, _("%s: offset %" PRIu64 " is not aligned " + "to zone size %lu"), + ctl->devname, ctl->offset, zonesize); + + if (ctl->offset > ctl->total_sectors) + errx(EXIT_FAILURE, _("%s: offset is greater than device size"), ctl->devname); + + if (ctl->count) + zlen = ctl->count * zonesize; + else if (ctl->length) + zlen = ctl->length; + else + zlen = ctl->total_sectors; + if (ctl->offset + zlen > ctl->total_sectors) + zlen = ctl->total_sectors - ctl->offset; + + if (ctl->length && + (zlen & (zonesize - 1)) && + ctl->offset + zlen != ctl->total_sectors) + errx(EXIT_FAILURE, _("%s: number of sectors %" PRIu64 " is not aligned " + "to zone size %lu"), + ctl->devname, ctl->length, zonesize); + + za.sector = ctl->offset; + za.nr_sectors = zlen; + + if (ioctl(fd, BLKRESETZONE, &za) == -1) + err(EXIT_FAILURE, _("%s: BLKRESETZONE ioctl failed"), ctl->devname); + else if (ctl->verbose) + printf(_("%s: successfully reset in range from %" PRIu64 ", to %" PRIu64), + ctl->devname, + ctl->offset, + ctl->offset + zlen); + close(fd); + return 0; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s <command> [options] <device>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Run zone command on the given block device.\n"), out); + + fputs(USAGE_COMMANDS, out); + for (i = 0; i < ARRAY_SIZE(commands); i++) + fprintf(out, " %-11s %s\n", commands[i].name, _(commands[i].help)); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -o, --offset <sector> start sector of zone to act (in 512-byte sectors)\n"), out); + fputs(_(" -l, --length <sectors> maximum sectors to act (in 512-byte sectors)\n"), out); + fputs(_(" -c, --count <number> maximum number of zones\n"), out); + fputs(_(" -v, --verbose display more details\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(24)); + + printf(USAGE_MAN_TAIL("blkzone(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + int c; + struct blkzone_control ctl = { + .devname = NULL, + .offset = 0, + .count = 0, + .length = 0 + }; + + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { "count", required_argument, NULL, 'c' }, /* max #of zones to operate on */ + { "length", required_argument, NULL, 'l' }, /* max of sectors to operate on */ + { "offset", required_argument, NULL, 'o' }, /* starting LBA */ + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'c', 'l' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + if (argc >= 2 && *argv[1] != '-') { + ctl.command = name_to_command(argv[1]); + if (!ctl.command) + errx(EXIT_FAILURE, _("%s is not valid command name"), argv[1]); + argv++; + argc--; + } + + while ((c = getopt_long(argc, argv, "hc:l:o:vV", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'h': + usage(); + break; + case 'c': + ctl.count = strtou32_or_err(optarg, + _("failed to parse number of zones")); + break; + case 'l': + ctl.length = strtosize_or_err(optarg, + _("failed to parse number of sectors")); + break; + case 'o': + ctl.offset = strtosize_or_err(optarg, + _("failed to parse zone offset")); + break; + case 'v': + ctl.verbose = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (!ctl.command) + errx(EXIT_FAILURE, _("no command specified")); + + if (optind == argc) + errx(EXIT_FAILURE, _("no device specified")); + ctl.devname = argv[optind++]; + + if (optind != argc) + errx(EXIT_FAILURE,_("unexpected number of arguments")); + + if (ctl.command->handler(&ctl) < 0) + return EXIT_FAILURE; + + return EXIT_SUCCESS; + +} diff --git a/sys-utils/chcpu.8 b/sys-utils/chcpu.8 new file mode 100644 index 0000000..2fb7111 --- /dev/null +++ b/sys-utils/chcpu.8 @@ -0,0 +1,106 @@ +.TH CHCPU 8 "July 2014" "util-linux" "System Administration" +.SH NAME +chcpu \- configure CPUs +.SH SYNOPSIS +.B chcpu +.BR \-c | \-d | \-e | \-g +.I cpu-list +.br +.B chcpu \-p +.I mode +.br +.B chcpu +.BR \-r | \-h | \-V +.SH DESCRIPTION +.B chcpu +can modify the state of CPUs. It can enable or disable CPUs, scan for new +CPUs, change the CPU dispatching +.I mode +of the underlying hypervisor, and request CPUs from the hypervisor +(configure) or return CPUs to the hypervisor (deconfigure). +.PP +Some options have a +.I cpu-list +argument. Use this argument to specify a comma-separated list of CPUs. The +list can contain individual CPU addresses or ranges of addresses. For +example, +.B 0,5,7,9-11 +makes the command applicable to the CPUs with the addresses 0, 5, 7, 9, 10, +and 11. +.SH OPTIONS +.TP +.BR \-c , " \-\-configure " \fIcpu-list\fP +Configure the specified CPUs. Configuring a CPU means that the hypervisor +takes a CPU from the CPU pool and assigns it to the virtual hardware on which +your kernel runs. +.TP +.BR \-d , " \-\-disable " \fIcpu-list\fP +Disable the specified CPUs. Disabling a CPU means that the kernel sets it +offline. +.TP +.BR \-e , " \-\-enable " \fIcpu-list\fP +Enable the specified CPUs. Enabling a CPU means that the kernel sets it +online. A CPU must be configured, see \fB\-c\fR, before it can be enabled. +.TP +.BR \-g , " \-\-deconfigure " \fIcpu-list\fP +Deconfigure the specified CPUs. Deconfiguring a CPU means that the +hypervisor removes the CPU from the virtual hardware on which the Linux +instance runs and returns it to the CPU pool. A CPU must be offline, see +\fB\-d\fR, before it can be deconfigured. +.TP +.BR \-p , " \-\-dispatch " \fImode\fP +Set the CPU dispatching +.I mode +(polarization). This option has an effect only if your hardware architecture +and hypervisor support CPU polarization. Available +.I modes +are: +.RS 14 +.TP 12 +.PD 0 +.B horizontal +The workload is spread across all available CPUs. +.TP 12 +.B vertical +The workload is concentrated on few CPUs. +.RE +.PD 1 +.TP +.BR \-r , " \-\-rescan" +Trigger a rescan of CPUs. After a rescan, the Linux kernel recognizes +the new CPUs. Use this option on systems that do not +automatically detect newly attached CPUs. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. + +.SH RETURN CODES +.B chcpu +has the following return codes: +.TP +.B 0 +success +.TP +.B 1 +failure +.TP +.B 64 +partial success +.RE +.SH AUTHOR +.MT heiko.carstens@de.ibm.com +Heiko Carstens +.ME +.SH COPYRIGHT +Copyright IBM Corp. 2011 +.br +.SH "SEE ALSO" +.BR lscpu (1) +.SH AVAILABILITY +The chcpu command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/chcpu.c b/sys-utils/chcpu.c new file mode 100644 index 0000000..36c47af --- /dev/null +++ b/sys-utils/chcpu.c @@ -0,0 +1,389 @@ +/* + * chcpu - CPU configuration tool + * + * Copyright IBM Corp. 2011 + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/utsname.h> +#include <unistd.h> +#include <stdarg.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "cpuset.h" +#include "nls.h" +#include "xalloc.h" +#include "c.h" +#include "strutils.h" +#include "bitops.h" +#include "path.h" +#include "closestream.h" +#include "optutils.h" + +#define EXCL_ERROR "--{configure,deconfigure,disable,dispatch,enable}" + +/* partial success, otherwise we return regular EXIT_{SUCCESS,FAILURE} */ +#define CHCPU_EXIT_SOMEOK 64 + +#define _PATH_SYS_CPU "/sys/devices/system/cpu" + +static cpu_set_t *onlinecpus; +static int maxcpus; + +#define is_cpu_online(cpu) (CPU_ISSET_S((cpu), CPU_ALLOC_SIZE(maxcpus), onlinecpus)) +#define num_online_cpus() (CPU_COUNT_S(CPU_ALLOC_SIZE(maxcpus), onlinecpus)) + +enum { + CMD_CPU_ENABLE = 0, + CMD_CPU_DISABLE, + CMD_CPU_CONFIGURE, + CMD_CPU_DECONFIGURE, + CMD_CPU_RESCAN, + CMD_CPU_DISPATCH_HORIZONTAL, + CMD_CPU_DISPATCH_VERTICAL, +}; + +/* returns: 0 = success + * < 0 = failure + * > 0 = partial success + */ +static int cpu_enable(struct path_cxt *sys, cpu_set_t *cpu_set, size_t setsize, int enable) +{ + int cpu; + int online, rc; + int configured = -1; + int fails = 0; + + for (cpu = 0; cpu < maxcpus; cpu++) { + if (!CPU_ISSET_S(cpu, setsize, cpu_set)) + continue; + if (ul_path_accessf(sys, F_OK, "cpu%d", cpu) != 0) { + warnx(_("CPU %u does not exist"), cpu); + fails++; + continue; + } + if (ul_path_accessf(sys, F_OK, "cpu%d/online", cpu) != 0) { + warnx(_("CPU %u is not hot pluggable"), cpu); + fails++; + continue; + } + if (ul_path_readf_s32(sys, &online, "cpu%d/online", cpu) == 0 + && online == 1 + && enable == 1) { + printf(_("CPU %u is already enabled\n"), cpu); + continue; + } + if (online == 0 && enable == 0) { + printf(_("CPU %u is already disabled\n"), cpu); + continue; + } + if (ul_path_accessf(sys, F_OK, "cpu%d/configure", cpu) == 0) + ul_path_readf_s32(sys, &configured, "cpu%d/configure", cpu); + if (enable) { + rc = ul_path_writef_string(sys, "1", "cpu%d/online", cpu); + if (rc != 0 && configured == 0) { + warn(_("CPU %u enable failed (CPU is deconfigured)"), cpu); + fails++; + } else if (rc != 0) { + warn(_("CPU %u enable failed"), cpu); + fails++; + } else + printf(_("CPU %u enabled\n"), cpu); + } else { + if (onlinecpus && num_online_cpus() == 1) { + warnx(_("CPU %u disable failed (last enabled CPU)"), cpu); + fails++; + continue; + } + rc = ul_path_writef_string(sys, "0", "cpu%d/online", cpu); + if (rc != 0) { + warn(_("CPU %u disable failed"), cpu); + fails++; + } else { + printf(_("CPU %u disabled\n"), cpu); + if (onlinecpus) + CPU_CLR_S(cpu, setsize, onlinecpus); + } + } + } + + return fails == 0 ? 0 : fails == maxcpus ? -1 : 1; +} + +static int cpu_rescan(struct path_cxt *sys) +{ + if (ul_path_access(sys, F_OK, "rescan") != 0) + errx(EXIT_FAILURE, _("This system does not support rescanning of CPUs")); + + if (ul_path_write_string(sys, "1", "rescan") != 0) + err(EXIT_FAILURE, _("Failed to trigger rescan of CPUs")); + + printf(_("Triggered rescan of CPUs\n")); + return 0; +} + +static int cpu_set_dispatch(struct path_cxt *sys, int mode) +{ + if (ul_path_access(sys, F_OK, "dispatching") != 0) + errx(EXIT_FAILURE, _("This system does not support setting " + "the dispatching mode of CPUs")); + if (mode == 0) { + if (ul_path_write_string(sys, "0", "dispatching") != 0) + err(EXIT_FAILURE, _("Failed to set horizontal dispatch mode")); + + printf(_("Successfully set horizontal dispatching mode\n")); + } else { + if (ul_path_write_string(sys, "1", "dispatching") != 0) + err(EXIT_FAILURE, _("Failed to set vertical dispatch mode")); + + printf(_("Successfully set vertical dispatching mode\n")); + } + return 0; +} + +/* returns: 0 = success + * < 0 = failure + * > 0 = partial success + */ +static int cpu_configure(struct path_cxt *sys, cpu_set_t *cpu_set, size_t setsize, int configure) +{ + int cpu; + int rc, current; + int fails = 0; + + for (cpu = 0; cpu < maxcpus; cpu++) { + if (!CPU_ISSET_S(cpu, setsize, cpu_set)) + continue; + if (ul_path_accessf(sys, F_OK, "cpu%d", cpu) != 0) { + warnx(_("CPU %u does not exist"), cpu); + fails++; + continue; + } + if (ul_path_accessf(sys, F_OK, "cpu%d/configure", cpu) != 0) { + warnx(_("CPU %u is not configurable"), cpu); + fails++; + continue; + } + ul_path_readf_s32(sys, ¤t, "cpu%d/configure", cpu); + if (current == 1 && configure == 1) { + printf(_("CPU %u is already configured\n"), cpu); + continue; + } + if (current == 0 && configure == 0) { + printf(_("CPU %u is already deconfigured\n"), cpu); + continue; + } + if (current == 1 && configure == 0 && onlinecpus && + is_cpu_online(cpu)) { + warnx(_("CPU %u deconfigure failed (CPU is enabled)"), cpu); + fails++; + continue; + } + if (configure) { + rc = ul_path_writef_string(sys, "1", "cpu%d/configure", cpu); + if (rc != 0) { + warn(_("CPU %u configure failed"), cpu); + fails++; + } else + printf(_("CPU %u configured\n"), cpu); + } else { + rc = ul_path_writef_string(sys, "0", "cpu%d/configure", cpu); + if (rc != 0) { + warn(_("CPU %u deconfigure failed"), cpu); + fails++; + } else + printf(_("CPU %u deconfigured\n"), cpu); + } + } + + return fails == 0 ? 0 : fails == maxcpus ? -1 : 1; +} + +static void cpu_parse(char *cpu_string, cpu_set_t *cpu_set, size_t setsize) +{ + int rc; + + rc = cpulist_parse(cpu_string, cpu_set, setsize, 1); + if (rc == 0) + return; + if (rc == 2) + errx(EXIT_FAILURE, _("invalid CPU number in CPU list: %s"), cpu_string); + errx(EXIT_FAILURE, _("failed to parse CPU list: %s"), cpu_string); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fprintf(out, _( + "\nUsage:\n" + " %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Configure CPUs in a multi-processor system.\n"), out); + + fputs(USAGE_OPTIONS, stdout); + fputs(_( + " -e, --enable <cpu-list> enable cpus\n" + " -d, --disable <cpu-list> disable cpus\n" + " -c, --configure <cpu-list> configure cpus\n" + " -g, --deconfigure <cpu-list> deconfigure cpus\n" + " -p, --dispatch <mode> set dispatching mode\n" + " -r, --rescan trigger rescan of cpus\n" + ), stdout); + printf(USAGE_HELP_OPTIONS(31)); + + printf(USAGE_MAN_TAIL("chcpu(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + struct path_cxt *sys = NULL; /* _PATH_SYS_CPU handler */ + cpu_set_t *cpu_set; + size_t setsize; + int cmd = -1; + int c, rc; + + static const struct option longopts[] = { + { "configure", required_argument, NULL, 'c' }, + { "deconfigure",required_argument, NULL, 'g' }, + { "disable", required_argument, NULL, 'd' }, + { "dispatch", required_argument, NULL, 'p' }, + { "enable", required_argument, NULL, 'e' }, + { "help", no_argument, NULL, 'h' }, + { "rescan", no_argument, NULL, 'r' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'c','d','e','g','p' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + ul_path_init_debug(); + sys = ul_new_path(_PATH_SYS_CPU); + if (!sys) + err(EXIT_FAILURE, _("failed to initialize sysfs handler")); + + maxcpus = get_max_number_of_cpus(); + if (maxcpus < 1) + errx(EXIT_FAILURE, _("cannot determine NR_CPUS; aborting")); + + if (ul_path_access(sys, F_OK, "online") == 0) + ul_path_readf_cpulist(sys, &cpu_set, maxcpus, "online"); + + setsize = CPU_ALLOC_SIZE(maxcpus); + cpu_set = CPU_ALLOC(maxcpus); + if (!cpu_set) + err(EXIT_FAILURE, _("cpuset_alloc failed")); + + while ((c = getopt_long(argc, argv, "c:d:e:g:hp:rV", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'c': + cmd = CMD_CPU_CONFIGURE; + cpu_parse(argv[optind - 1], cpu_set, setsize); + break; + case 'd': + cmd = CMD_CPU_DISABLE; + cpu_parse(argv[optind - 1], cpu_set, setsize); + break; + case 'e': + cmd = CMD_CPU_ENABLE; + cpu_parse(argv[optind - 1], cpu_set, setsize); + break; + case 'g': + cmd = CMD_CPU_DECONFIGURE; + cpu_parse(argv[optind - 1], cpu_set, setsize); + break; + case 'h': + usage(); + case 'p': + if (strcmp("horizontal", argv[optind - 1]) == 0) + cmd = CMD_CPU_DISPATCH_HORIZONTAL; + else if (strcmp("vertical", argv[optind - 1]) == 0) + cmd = CMD_CPU_DISPATCH_VERTICAL; + else + errx(EXIT_FAILURE, _("unsupported argument: %s"), + argv[optind -1 ]); + break; + case 'r': + cmd = CMD_CPU_RESCAN; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if ((argc == 1) || (argc != optind)) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + switch (cmd) { + case CMD_CPU_ENABLE: + rc = cpu_enable(sys, cpu_set, maxcpus, 1); + break; + case CMD_CPU_DISABLE: + rc = cpu_enable(sys, cpu_set, maxcpus, 0); + break; + case CMD_CPU_CONFIGURE: + rc = cpu_configure(sys, cpu_set, maxcpus, 1); + break; + case CMD_CPU_DECONFIGURE: + rc = cpu_configure(sys, cpu_set, maxcpus, 0); + break; + case CMD_CPU_RESCAN: + rc = cpu_rescan(sys); + break; + case CMD_CPU_DISPATCH_HORIZONTAL: + rc = cpu_set_dispatch(sys, 0); + break; + case CMD_CPU_DISPATCH_VERTICAL: + rc = cpu_set_dispatch(sys, 1); + break; + default: + rc = -EINVAL; + break; + } + + ul_unref_path(sys); + + return rc == 0 ? EXIT_SUCCESS : + rc < 0 ? EXIT_FAILURE : CHCPU_EXIT_SOMEOK; +} diff --git a/sys-utils/chmem.8 b/sys-utils/chmem.8 new file mode 100644 index 0000000..8a3b34d --- /dev/null +++ b/sys-utils/chmem.8 @@ -0,0 +1,114 @@ +.TH CHMEM 8 "October 2016" "util-linux" "System Administration" +.SH NAME +chmem \- configure memory +.SH SYNOPSIS +.B chmem +.RB [ \-h "] [" \-V "] [" \-v "] [" \-e | \-d "]" +[\fISIZE\fP|\fIRANGE\fP|\fB\-b\fP \fIBLOCKRANGE\fP] +[-z ZONE] +.SH DESCRIPTION +The chmem command sets a particular size or range of memory online or offline. +. +.IP "\(hy" 2 +Specify \fISIZE\fP as <size>[m|M|g|G]. With m or M, <size> specifies the memory +size in MiB (1024 x 1024 bytes). With g or G, <size> specifies the memory size +in GiB (1024 x 1024 x 1024 bytes). The default unit is MiB. +. +.IP "\(hy" 2 +Specify \fIRANGE\fP in the form 0x<start>-0x<end> as shown in the output of the +\fBlsmem\fP command. <start> is the hexadecimal address of the first byte and <end> +is the hexadecimal address of the last byte in the memory range. +. +.IP "\(hy" 2 +Specify \fIBLOCKRANGE\fP in the form <first>-<last> or <block> as shown in the +output of the \fBlsmem\fP command. <first> is the number of the first memory block +and <last> is the number of the last memory block in the memory +range. Alternatively a single block can be specified. \fIBLOCKRANGE\fP requires +the \fB--blocks\fP option. +. +.IP "\(hy" 2 +Specify \fIZONE\fP as the name of a memory zone, as shown in the output of the +\fBlsmem -o +ZONES\fP command. The output shows one or more valid memory zones +for each memory range. If multiple zones are shown, then the memory range +currently belongs to the first zone. By default, chmem will set memory online +to the zone Movable, if this is among the valid zones. This default can be +changed by specifying the \fB--zone\fP option with another valid zone. +For memory ballooning, it is recommended to select the zone Movable for memory +online and offline, if possible. Memory in this zone is much more likely to be +able to be offlined again, but it cannot be used for arbitrary kernel +allocations, only for migratable pages (e.g. anonymous and page cache pages). +Use the \fB\-\-help\fR option to see all available zones. +. +.PP +\fISIZE\fP and \fIRANGE\fP must be aligned to the Linux memory block size, as +shown in the output of the \fBlsmem\fP command. + +Setting memory online can fail for various reasons. On virtualized systems it +can fail if the hypervisor does not have enough memory left, for example +because memory was overcommitted. Setting memory offline can fail if Linux +cannot free the memory. If only part of the requested memory can be set online +or offline, a message tells you how much memory was set online or offline +instead of the requested amount. + +When setting memory online \fBchmem\fP starts with the lowest memory block +numbers. When setting memory offline \fBchmem\fP starts with the highest memory +block numbers. +.SH OPTIONS +.TP +.BR \-b ", " \-\-blocks +Use a \fIBLOCKRANGE\fP parameter instead of \fIRANGE\fP or \fISIZE\fP for the +\fB--enable\fP and \fB--disable\fP options. +.TP +.BR \-d ", " \-\-disable +Set the specified \fIRANGE\fP, \fISIZE\fP, or \fIBLOCKRANGE\fP of memory offline. +.TP +.BR \-e ", " \-\-enable +Set the specified \fIRANGE\fP, \fISIZE\fP, or \fIBLOCKRANGE\fP of memory online. +.TP +.BR \-z ", " \-\-zone +Select the memory \fIZONE\fP where to set the specified \fIRANGE\fP, \fISIZE\fP, +or \fIBLOCKRANGE\fP of memory online or offline. By default, memory will be set +online to the zone Movable, if possible. +.TP +.BR \-h ", " \-\-help +Print a short help text, then exit. +.TP +.BR \-v ", " \-\-verbose +Verbose mode. Causes \fBchmem\fP to print debugging messages about it's +progress. +.TP +.BR \-V ", " \-\-version +Print the version number, then exit. +.SH RETURN CODES +.B chmem +has the following return codes: +.TP +.B 0 +success +.TP +.B 1 +failure +.TP +.B 64 +partial success +.SH EXAMPLES +.TP +.B chmem --enable 1024 +This command requests 1024 MiB of memory to be set online. +.TP +.B chmem -e 2g +This command requests 2 GiB of memory to be set online. +.TP +.B chmem --disable 0x00000000e4000000-0x00000000f3ffffff +This command requests the memory range starting with 0x00000000e4000000 +and ending with 0x00000000f3ffffff to be set offline. +.TP +.B chmem -b -d 10 +This command requests the memory block number 10 to be set offline. +.SH SEE ALSO +.BR lsmem (1) +.SH AVAILABILITY +The \fBchmem\fP command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/chmem.c b/sys-utils/chmem.c new file mode 100644 index 0000000..861f6cf --- /dev/null +++ b/sys-utils/chmem.c @@ -0,0 +1,453 @@ +/* + * chmem - Memory configuration tool + * + * Copyright IBM Corp. 2016 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <getopt.h> +#include <assert.h> +#include <dirent.h> + +#include "c.h" +#include "nls.h" +#include "path.h" +#include "strutils.h" +#include "strv.h" +#include "optutils.h" +#include "closestream.h" +#include "xalloc.h" + +/* partial success, otherwise we return regular EXIT_{SUCCESS,FAILURE} */ +#define CHMEM_EXIT_SOMEOK 64 + +#define _PATH_SYS_MEMORY "/sys/devices/system/memory" + +struct chmem_desc { + struct path_cxt *sysmem; /* _PATH_SYS_MEMORY handler */ + struct dirent **dirs; + int ndirs; + uint64_t block_size; + uint64_t start; + uint64_t end; + uint64_t size; + unsigned int use_blocks : 1; + unsigned int is_size : 1; + unsigned int verbose : 1; + unsigned int have_zones : 1; +}; + +enum { + CMD_MEMORY_ENABLE = 0, + CMD_MEMORY_DISABLE, + CMD_NONE +}; + +enum zone_id { + ZONE_DMA = 0, + ZONE_DMA32, + ZONE_NORMAL, + ZONE_HIGHMEM, + ZONE_MOVABLE, + ZONE_DEVICE, +}; + +static char *zone_names[] = { + [ZONE_DMA] = "DMA", + [ZONE_DMA32] = "DMA32", + [ZONE_NORMAL] = "Normal", + [ZONE_HIGHMEM] = "Highmem", + [ZONE_MOVABLE] = "Movable", + [ZONE_DEVICE] = "Device", +}; + +/* + * name must be null-terminated + */ +static int zone_name_to_id(const char *name) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(zone_names); i++) { + if (!strcasecmp(name, zone_names[i])) + return i; + } + return -1; +} + +static void idxtostr(struct chmem_desc *desc, uint64_t idx, char *buf, size_t bufsz) +{ + uint64_t start, end; + + start = idx * desc->block_size; + end = start + desc->block_size - 1; + snprintf(buf, bufsz, + _("Memory Block %"PRIu64" (0x%016"PRIx64"-0x%016"PRIx64")"), + idx, start, end); +} + +static int chmem_size(struct chmem_desc *desc, int enable, int zone_id) +{ + char *name, *onoff, line[BUFSIZ], str[BUFSIZ]; + uint64_t size, index; + const char *zn; + int i, rc; + + size = desc->size; + onoff = enable ? "online" : "offline"; + i = enable ? 0 : desc->ndirs - 1; + + if (enable && zone_id >= 0) { + if (zone_id == ZONE_MOVABLE) + onoff = "online_movable"; + else + onoff = "online_kernel"; + } + + for (; i >= 0 && i < desc->ndirs && size; i += enable ? 1 : -1) { + name = desc->dirs[i]->d_name; + index = strtou64_or_err(name + 6, _("Failed to parse index")); + + if (ul_path_readf_buffer(desc->sysmem, line, sizeof(line), "%s/state", name) > 0 + && strncmp(onoff, line, 6) == 0) + continue; + + if (desc->have_zones) { + ul_path_readf_buffer(desc->sysmem, line, sizeof(line), "%s/valid_zones", name); + if (zone_id >= 0) { + zn = zone_names[zone_id]; + if (enable && !strcasestr(line, zn)) + continue; + if (!enable && strncasecmp(line, zn, strlen(zn))) + continue; + } else if (enable) { + /* By default, use zone Movable for online, if valid */ + if (strcasestr(line, zone_names[ZONE_MOVABLE])) + onoff = "online_movable"; + else + onoff = "online"; + } + } + + idxtostr(desc, index, str, sizeof(str)); + rc = ul_path_writef_string(desc->sysmem, onoff, "%s/state", name); + if (rc != 0 && desc->verbose) { + if (enable) + fprintf(stdout, _("%s enable failed\n"), str); + else + fprintf(stdout, _("%s disable failed\n"), str); + } else if (rc == 0 && desc->verbose) { + if (enable) + fprintf(stdout, _("%s enabled\n"), str); + else + fprintf(stdout, _("%s disabled\n"), str); + } + if (rc == 0) + size--; + } + if (size) { + uint64_t bytes; + char *sizestr; + + bytes = (desc->size - size) * desc->block_size; + sizestr = size_to_human_string(SIZE_SUFFIX_1LETTER, bytes); + if (enable) + warnx(_("Could only enable %s of memory"), sizestr); + else + warnx(_("Could only disable %s of memory"), sizestr); + free(sizestr); + } + return size == 0 ? 0 : size == desc->size ? -1 : 1; +} + +static int chmem_range(struct chmem_desc *desc, int enable, int zone_id) +{ + char *name, *onoff, line[BUFSIZ], str[BUFSIZ]; + uint64_t index, todo; + const char *zn; + int i, rc; + + todo = desc->end - desc->start + 1; + onoff = enable ? "online" : "offline"; + + if (enable && zone_id >= 0) { + if (zone_id == ZONE_MOVABLE) + onoff = "online_movable"; + else + onoff = "online_kernel"; + } + + for (i = 0; i < desc->ndirs; i++) { + name = desc->dirs[i]->d_name; + index = strtou64_or_err(name + 6, _("Failed to parse index")); + if (index < desc->start) + continue; + if (index > desc->end) + break; + idxtostr(desc, index, str, sizeof(str)); + if (ul_path_readf_buffer(desc->sysmem, line, sizeof(line), "%s/state", name) > 0 + && strncmp(onoff, line, 6) == 0) { + if (desc->verbose && enable) + fprintf(stdout, _("%s already enabled\n"), str); + else if (desc->verbose && !enable) + fprintf(stdout, _("%s already disabled\n"), str); + todo--; + continue; + } + + if (desc->have_zones) { + ul_path_readf_buffer(desc->sysmem, line, sizeof(line), "%s/valid_zones", name); + if (zone_id >= 0) { + zn = zone_names[zone_id]; + if (enable && !strcasestr(line, zn)) { + warnx(_("%s enable failed: Zone mismatch"), str); + continue; + } + if (!enable && strncasecmp(line, zn, strlen(zn))) { + warnx(_("%s disable failed: Zone mismatch"), str); + continue; + } + } else if (enable) { + /* By default, use zone Movable for online, if valid */ + if (strcasestr(line, zone_names[ZONE_MOVABLE])) + onoff = "online_movable"; + else + onoff = "online"; + } + } + + rc = ul_path_writef_string(desc->sysmem, onoff, "%s/state", name); + if (rc != 0) { + if (enable) + warn(_("%s enable failed"), str); + else + warn(_("%s disable failed"), str); + } else if (desc->verbose) { + if (enable) + fprintf(stdout, _("%s enabled\n"), str); + else + fprintf(stdout, _("%s disabled\n"), str); + } + if (rc == 0) + todo--; + } + return todo == 0 ? 0 : todo == desc->end - desc->start + 1 ? -1 : 1; +} + +static int filter(const struct dirent *de) +{ + if (strncmp("memory", de->d_name, 6)) + return 0; + return isdigit_string(de->d_name + 6); +} + +static void read_info(struct chmem_desc *desc) +{ + char line[128]; + + desc->ndirs = scandir(_PATH_SYS_MEMORY, &desc->dirs, filter, versionsort); + if (desc->ndirs <= 0) + err(EXIT_FAILURE, _("Failed to read %s"), _PATH_SYS_MEMORY); + ul_path_read_buffer(desc->sysmem, line, sizeof(line), "block_size_bytes"); + desc->block_size = strtoumax(line, NULL, 16); +} + +static void parse_single_param(struct chmem_desc *desc, char *str) +{ + if (desc->use_blocks) { + desc->start = strtou64_or_err(str, _("Failed to parse block number")); + desc->end = desc->start; + return; + } + desc->is_size = 1; + desc->size = strtosize_or_err(str, _("Failed to parse size")); + if (isdigit(str[strlen(str) - 1])) + desc->size *= 1024*1024; + if (desc->size % desc->block_size) { + errx(EXIT_FAILURE, _("Size must be aligned to memory block size (%s)"), + size_to_human_string(SIZE_SUFFIX_1LETTER, desc->block_size)); + } + desc->size /= desc->block_size; +} + +static void parse_range_param(struct chmem_desc *desc, char *start, char *end) +{ + if (desc->use_blocks) { + desc->start = strtou64_or_err(start, _("Failed to parse start")); + desc->end = strtou64_or_err(end, _("Failed to parse end")); + return; + } + if (strlen(start) < 2 || start[1] != 'x') + errx(EXIT_FAILURE, _("Invalid start address format: %s"), start); + if (strlen(end) < 2 || end[1] != 'x') + errx(EXIT_FAILURE, _("Invalid end address format: %s"), end); + desc->start = strtox64_or_err(start, _("Failed to parse start address")); + desc->end = strtox64_or_err(end, _("Failed to parse end address")); + if (desc->start % desc->block_size || (desc->end + 1) % desc->block_size) { + errx(EXIT_FAILURE, + _("Start address and (end address + 1) must be aligned to " + "memory block size (%s)"), + size_to_human_string(SIZE_SUFFIX_1LETTER, desc->block_size)); + } + desc->start /= desc->block_size; + desc->end /= desc->block_size; +} + +static void parse_parameter(struct chmem_desc *desc, char *param) +{ + char **split; + + split = strv_split(param, "-"); + if (strv_length(split) > 2) + errx(EXIT_FAILURE, _("Invalid parameter: %s"), param); + if (strv_length(split) == 1) + parse_single_param(desc, split[0]); + else + parse_range_param(desc, split[0], split[1]); + strv_free(split); + if (desc->start > desc->end) + errx(EXIT_FAILURE, _("Invalid range: %s"), param); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] [SIZE|RANGE|BLOCKRANGE]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Set a particular size or range of memory online or offline.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -e, --enable enable memory\n"), out); + fputs(_(" -d, --disable disable memory\n"), out); + fputs(_(" -b, --blocks use memory blocks\n"), out); + fputs(_(" -z, --zone <name> select memory zone (see below)\n"), out); + fputs(_(" -v, --verbose verbose output\n"), out); + printf(USAGE_HELP_OPTIONS(20)); + + fputs(_("\nSupported zones:\n"), out); + for (i = 0; i < ARRAY_SIZE(zone_names); i++) + fprintf(out, " %s\n", zone_names[i]); + + printf(USAGE_MAN_TAIL("chmem(8)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + struct chmem_desc _desc = { 0 }, *desc = &_desc; + int cmd = CMD_NONE, zone_id = -1; + char *zone = NULL; + int c, rc; + + static const struct option longopts[] = { + {"block", no_argument, NULL, 'b'}, + {"disable", no_argument, NULL, 'd'}, + {"enable", no_argument, NULL, 'e'}, + {"help", no_argument, NULL, 'h'}, + {"verbose", no_argument, NULL, 'v'}, + {"version", no_argument, NULL, 'V'}, + {"zone", required_argument, NULL, 'z'}, + {NULL, 0, NULL, 0} + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'd','e' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + ul_path_init_debug(); + desc->sysmem = ul_new_path(_PATH_SYS_MEMORY); + if (!desc->sysmem) + err(EXIT_FAILURE, _("failed to initialize %s handler"), _PATH_SYS_MEMORY); + + read_info(desc); + + while ((c = getopt_long(argc, argv, "bdehvVz:", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'd': + cmd = CMD_MEMORY_DISABLE; + break; + case 'e': + cmd = CMD_MEMORY_ENABLE; + break; + case 'b': + desc->use_blocks = 1; + break; + case 'h': + usage(); + break; + case 'v': + desc->verbose = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'z': + zone = xstrdup(optarg); + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if ((argc == 1) || (argc != optind + 1) || (cmd == CMD_NONE)) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + parse_parameter(desc, argv[optind]); + + + /* The valid_zones sysfs attribute was introduced with kernel 3.18 */ + if (ul_path_access(desc->sysmem, F_OK, "memory0/valid_zones") == 0) + desc->have_zones = 1; + else if (zone) + warnx(_("zone ignored, no valid_zones sysfs attribute present")); + + if (zone && desc->have_zones) { + zone_id = zone_name_to_id(zone); + if (zone_id == -1) { + warnx(_("unknown memory zone: %s"), zone); + errtryhelp(EXIT_FAILURE); + } + } + + if (desc->is_size) + rc = chmem_size(desc, cmd == CMD_MEMORY_ENABLE ? 1 : 0, zone_id); + else + rc = chmem_range(desc, cmd == CMD_MEMORY_ENABLE ? 1 : 0, zone_id); + + ul_unref_path(desc->sysmem); + + return rc == 0 ? EXIT_SUCCESS : + rc < 0 ? EXIT_FAILURE : CHMEM_EXIT_SOMEOK; +} diff --git a/sys-utils/choom.1 b/sys-utils/choom.1 new file mode 100644 index 0000000..2b844cb --- /dev/null +++ b/sys-utils/choom.1 @@ -0,0 +1,82 @@ +.TH CHOOM 1 "April 2018" "util-linux" "User Commands" +.SH NAME +choom \- display and adjust OOM-killer score. +.SH SYNOPSIS +.B choom +.B \-p +.I pid +.sp +.B choom +.B \-p +.I pid +.B \-n +.I number +.sp +.B choom +.B \-n +.I number +.IR command\ [ argument ...] + +.SH DESCRIPTION +The \fBchoom\fP command displays and adjusts Out-Of-Memory killer score setting. + +.SH OPTIONS +.TP +.BR \-p ", " \-\-pid " \fIpid\fP +Specifies process ID. +.TP +.BR \-n , " \-\-adjust " \fIvalue\fP +Specify the adjust score value. +.TP +.BR \-h ", " \-\-help +Display help text and exit. +.TP +.BR \-V ", " \-\-version +Display version information and exit. +.SH NOTES +Linux kernel uses the badness heuristic to select which process gets killed in +out of memory conditions. + +The badness heuristic assigns a value to each candidate task ranging from 0 +(never kill) to 1000 (always kill) to determine which process is targeted. The +units are roughly a proportion along that range of allowed memory the process +may allocate from based on an estimation of its current memory and swap use. +For example, if a task is using all allowed memory, its badness score will be +1000. If it is using half of its allowed memory, its score will be 500. + +There is an additional factor included in the badness score: the current memory +and swap usage is discounted by 3% for root processes. + +The amount of "allowed" memory depends on the context in which the oom killer +was called. If it is due to the memory assigned to the allocating task's cpuset +being exhausted, the allowed memory represents the set of mems assigned to that +cpuset. If it is due to a mempolicy's node(s) being exhausted, the allowed +memory represents the set of mempolicy nodes. If it is due to a memory +limit (or swap limit) being reached, the allowed memory is that configured +limit. Finally, if it is due to the entire system being out of memory, the +allowed memory represents all allocatable resources. + +The adjust score value is added to the badness score before it is used to +determine which task to kill. Acceptable values range from -1000 to +1000. +This allows userspace to polarize the preference for oom killing either by +always preferring a certain task or completely disabling it. The lowest +possible value, -1000, is equivalent to disabling oom killing entirely for that +task since it will always report a badness score of 0. + +Setting an adjust score value of +500, for example, is roughly equivalent to +allowing the remainder of tasks sharing the same system, cpuset, mempolicy, or +memory controller resources to use at least 50% more memory. A value of -500, +on the other hand, would be roughly equivalent to discounting 50% of the task's +allowed memory from being considered as scoring against the task. + +.SH AUTHORS +.nf +Karel Zak <kzak@redhat.com> +.fi +.SH SEE ALSO +.BR proc (5) +.SH AVAILABILITY +The \fBchoom\fP command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/choom.c b/sys-utils/choom.c new file mode 100644 index 0000000..eff95b6 --- /dev/null +++ b/sys-utils/choom.c @@ -0,0 +1,159 @@ +/* + * choom - Change OOM score setting + * + * Copyright (C) 2018 Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <getopt.h> +#include <errno.h> + +#include "nls.h" +#include "c.h" +#include "path.h" +#include "strutils.h" +#include "closestream.h" + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %1$s [options] -p pid\n" + " %1$s [options] -n number -p pid\n" + " %1$s [options] -n number command [args...]]\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Display and adjust OOM-killer score.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -n, --adjust <num> specify the adjust score value\n"), out); + fputs(_(" -p, --pid <num> process ID\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(24)); + printf(USAGE_MAN_TAIL("choom(1)")); + exit(EXIT_SUCCESS); +} + +static int get_score(struct path_cxt *pc) +{ + int ret; + + if (ul_path_read_s32(pc, &ret, "oom_score") != 0) + err(EXIT_FAILURE, _("failed to read OOM score value")); + + return ret; +} + +static int get_score_adj(struct path_cxt *pc) +{ + int ret; + + if (ul_path_read_s32(pc, &ret, "oom_score_adj") != 0) + err(EXIT_FAILURE, _("failed to read OOM score adjust value")); + + return ret; +} + +static int set_score_adj(struct path_cxt *pc, int adj) +{ + return ul_path_write_s64(pc, adj, "oom_score_adj"); +} + +int main(int argc, char **argv) +{ + pid_t pid = 0; + int c, adj = 0, has_adj = 0; + struct path_cxt *pc = NULL; + + static const struct option longopts[] = { + { "adjust", required_argument, NULL, 'n' }, + { "pid", required_argument, NULL, 'p' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "hn:p:V", longopts, NULL)) != -1) { + switch (c) { + case 'p': + pid = strtos32_or_err(optarg, _("invalid PID argument")); + break; + case 'n': + adj = strtos32_or_err(optarg, _("invalid adjust argument")); + has_adj = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (optind < argc && pid) { + warnx(_("invalid argument: %s"), argv[optind]); + errtryhelp(EXIT_FAILURE); + } + if (!pid && argc - optind < 1) { + warnx(_("no PID or COMMAND specified")); + errtryhelp(EXIT_FAILURE); + } + if (optind < argc && !has_adj) { + warnx(_("no OOM score adjust value specified")); + errtryhelp(EXIT_FAILURE); + } + + pc = ul_new_path("/proc/%d", (int) (pid ? pid : getpid())); + + /* Show */ + if (!has_adj) { + printf(_("pid %d's current OOM score: %d\n"), pid, get_score(pc)); + printf(_("pid %d's current OOM score adjust value: %d\n"), pid, get_score_adj(pc)); + + /* Change */ + } else if (pid) { + int old = get_score_adj(pc); + + if (set_score_adj(pc, adj)) + err(EXIT_FAILURE, _("failed to set score adjust value")); + + printf(_("pid %d's OOM score adjust value changed from %d to %d\n"), pid, old, adj); + + /* Start new process */ + } else { + if (set_score_adj(pc, adj)) + err(EXIT_FAILURE, _("failed to set score adjust value")); + ul_unref_path(pc); + argv += optind; + execvp(argv[0], argv); + errexec(argv[0]); + } + + ul_unref_path(pc); + return EXIT_SUCCESS; +} diff --git a/sys-utils/ctrlaltdel.8 b/sys-utils/ctrlaltdel.8 new file mode 100644 index 0000000..a44ad19 --- /dev/null +++ b/sys-utils/ctrlaltdel.8 @@ -0,0 +1,58 @@ +.\" Copyright 1992, 1993 Rickard E. Faith (faith@cs.unc.edu) +.\" May be distributed under the GNU General Public License +.TH CTRLALTDEL 8 "October 2015" "util-linux" "System Administration" +.SH NAME +ctrlaltdel \- set the function of the Ctrl-Alt-Del combination +.SH SYNOPSIS +.BR "ctrlaltdel hard" | soft +.SH DESCRIPTION +Based on examination of the +.I linux/kernel/reboot.c +code, it is clear that there are two supported functions that the +Ctrl-Alt-Del sequence can perform. +.TP +.B hard +Immediately reboot the computer without calling +.BR sync (2) +and without any other preparation. This is the default. +.TP +.B soft +Make the kernel send the SIGINT (interrupt) signal to the +.B init +process (this is always the process with PID 1). If this option is used, +the +.BR init (8) +program must support this feature. Since there are now several +.BR init (8) +programs in the Linux community, please consult the documentation for the +version that you are currently using. +.PP +When the command is run without any argument, it will display the current +setting. +.PP +The function of +.B ctrlaltdel +is usually set in the +.I /etc/rc.local +file. +.SH OPTIONS +.TP +\fB\-V\fR, \fB\-\-version\fR +Display version information and exit. +.TP +\fB\-h\fR, \fB\-\-help\fR +Display help text and exit. +.SH FILES +.I /etc/rc.local +.SH "SEE ALSO" +.BR init (8), +.BR systemd (1) +.SH AUTHOR +.UR poe@daimi.aau.dk +Peter Orbaek +.UE +.SH AVAILABILITY +The ctrlaltdel command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/ctrlaltdel.c b/sys-utils/ctrlaltdel.c new file mode 100644 index 0000000..ea662c4 --- /dev/null +++ b/sys-utils/ctrlaltdel.c @@ -0,0 +1,114 @@ +/* + * ctrlaltdel.c - Set the function of the Ctrl-Alt-Del combination + * Created 4-Jul-92 by Peter Orbaek <poe@daimi.aau.dk> + * 1999-02-22 Arkadiusz MiÅ›kiewicz <misiek@pld.ORG.PL> + * - added Native Language Support + */ + +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/reboot.h> +#include "nls.h" +#include "c.h" +#include "closestream.h" +#include "pathnames.h" +#include "path.h" + +#define LINUX_REBOOT_CMD_CAD_ON 0x89ABCDEF +#define LINUX_REBOOT_CMD_CAD_OFF 0x00000000 + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s hard|soft\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fprintf(out, _("Set the function of the Ctrl-Alt-Del combination.\n")); + + fputs(USAGE_OPTIONS, out); + printf(USAGE_HELP_OPTIONS(16)); + printf(USAGE_MAN_TAIL("ctrlaltdel(8)")); + exit(EXIT_SUCCESS); +} + +static int get_cad(void) +{ + uint64_t val; + + if (ul_path_read_u64(NULL, &val, _PATH_PROC_CTRL_ALT_DEL) != 0) + err(EXIT_FAILURE, _("cannot read %s"), _PATH_PROC_CTRL_ALT_DEL); + + switch (val) { + case 0: + fputs("soft\n", stdout); + break; + case 1: + fputs("hard\n", stdout); + break; + default: + printf("%s hard\n", _("implicit")); + warnx(_("unexpected value in %s: %ju"), _PATH_PROC_CTRL_ALT_DEL, val); + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +static int set_cad(const char *arg) +{ + unsigned int cmd; + + if (geteuid()) { + warnx(_("You must be root to set the Ctrl-Alt-Del behavior")); + return EXIT_FAILURE; + } + if (!strcmp("hard", arg)) + cmd = LINUX_REBOOT_CMD_CAD_ON; + else if (!strcmp("soft", arg)) + cmd = LINUX_REBOOT_CMD_CAD_OFF; + else { + warnx(_("unknown argument: %s"), arg); + return EXIT_FAILURE; + } + if (reboot(cmd) < 0) { + warnx("reboot"); + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int ch, ret; + static const struct option longopts[] = { + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((ch = getopt_long(argc, argv, "Vh", longopts, NULL)) != -1) + switch (ch) { + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + + if (argc < 2) + ret = get_cad(); + else + ret = set_cad(argv[1]); + return ret; +} diff --git a/sys-utils/dmesg.1 b/sys-utils/dmesg.1 new file mode 100644 index 0000000..a93821a --- /dev/null +++ b/sys-utils/dmesg.1 @@ -0,0 +1,256 @@ +.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu) +.\" May be distributed under the GNU General Public License +.TH DMESG "1" "July 2012" "util-linux" "User Commands" +.SH NAME +dmesg \- print or control the kernel ring buffer +.SH SYNOPSIS +.B dmesg +[options] +.sp +.B dmesg \-\-clear +.br +.BR "dmesg \-\-read\-clear " [options] +.br +.BI "dmesg \-\-console\-level " level +.br +.B dmesg \-\-console\-on +.br +.B dmesg \-\-console\-off +.SH DESCRIPTION +.B dmesg +is used to examine or control the kernel ring buffer. +.PP +The default action is to display all messages from the kernel ring buffer. +.SH OPTIONS +The +.BR \-\-clear , +.BR \-\-read\-clear , +.BR \-\-console\-on , +.BR \-\-console\-off , +and +.B \-\-console\-level +options are mutually exclusive. +.PP +.IP "\fB\-C\fR, \fB\-\-clear\fR" +Clear the ring buffer. +.IP "\fB\-c\fR, \fB\-\-read\-clear\fR" +Clear the ring buffer after first printing its contents. +.IP "\fB\-D\fR, \fB\-\-console\-off\fR" +Disable the printing of messages to the console. +.IP "\fB\-d\fR, \fB\-\-show\-delta\fR" +Display the timestamp and the time delta spent between messages. If used +together with +.B \-\-notime +then only the time delta without the timestamp is printed. +.IP "\fB\-E\fR, \fB\-\-console\-on\fR" +Enable printing messages to the console. +.IP "\fB\-e\fR, \fB\-\-reltime\fR" +Display the local time and the delta in human-readable format. Be aware that +conversion to the local time could be inaccurate (see \fB\-T\fR for more +details). +.IP "\fB\-F\fR, \fB\-\-file \fIfile\fR" +Read the syslog messages from the given +.IR file . +Note that \fB\-F\fR does not support messages in kmsg format. The old syslog format is supported only. +.IP "\fB\-f\fR, \fB\-\-facility \fIlist\fR" +Restrict output to the given (comma-separated) +.I list +of facilities. For example: +.PP +.RS 14 +.B dmesg \-\-facility=daemon +.RE +.IP +will print messages from system daemons only. For all supported facilities +see the +.B \-\-help +output. +.IP "\fB\-H\fR, \fB\-\-human\fR" +Enable human-readable output. See also \fB\-\-color\fR, \fB\-\-reltime\fR +and \fB\-\-nopager\fR. +.IP "\fB\-k\fR, \fB\-\-kernel\fR" +Print kernel messages. +.IP "\fB\-L\fR, \fB\-\-color\fR[=\fIwhen\fR]" +Colorize the output. The optional argument \fIwhen\fP +can be \fBauto\fR, \fBnever\fR or \fBalways\fR. If the \fIwhen\fR argument is omitted, +it defaults to \fBauto\fR. The colors can be disabled; for the current built-in default +see the \fB\-\-help\fR output. See also the \fBCOLORS\fR section below. +.IP "\fB\-l\fR, \fB\-\-level \fIlist\fR" +Restrict output to the given (comma-separated) +.I list +of levels. For example: +.PP +.RS 14 +.B dmesg \-\-level=err,warn +.RE +.IP +will print error and warning messages only. For all supported levels see the +.B \-\-help +output. +.IP "\fB\-n\fR, \fB\-\-console\-level \fIlevel\fR +Set the +.I level +at which printing of messages is done to the console. The +.I level +is a level number or abbreviation of the level name. For all supported +levels see the +.B \-\-help +output. +.sp +For example, +.B \-n 1 +or +.B \-n emerg +prevents all messages, except emergency (panic) messages, from appearing on +the console. All levels of messages are still written to +.IR /proc/kmsg , +so +.BR syslogd (8) +can still be used to control exactly where kernel messages appear. When the +.B \-n +option is used, +.B dmesg +will +.I not +print or clear the kernel ring buffer. +.IP "\fB\-P\fR, \fB\-\-nopager\fR" +Do not pipe output into a pager. A pager is enabled by default for \fB\-\-human\fR output. +.IP "\fB\-p\fR, \fB\-\-force\-prefix\fR" +Add facility, level or timestamp information to each line of a multi-line message. +.IP "\fB\-r\fR, \fB\-\-raw\fR" +Print the raw message buffer, i.e. do not strip the log-level prefixes. + +Note that the real raw format depends on the method how +.BR dmesg (1) +reads kernel messages. The /dev/kmsg device uses a different format than +.BR syslog (2). +For backward compatibility, +.BR dmesg (1) +returns data always in the +.BR syslog (2) +format. It is possible to read the real raw data from /dev/kmsg by, for example, +the command 'dd if=/dev/kmsg iflag=nonblock'. +.IP "\fB\-S\fR, \fB\-\-syslog\fR" +Force \fBdmesg\fR to use the +.BR syslog (2) +kernel interface to read kernel messages. The default is to use /dev/kmsg rather +than +.BR syslog (2) +since kernel 3.5.0. +.IP "\fB\-s\fR, \fB\-\-buffer\-size \fIsize\fR +Use a buffer of +.I size +to query the kernel ring buffer. This is 16392 by default. (The default +kernel syslog buffer size was 4096 at first, 8192 since 1.3.54, 16384 since +2.1.113.) If you have set the kernel buffer to be larger than the default, +then this option can be used to view the entire buffer. +.IP "\fB\-T\fR, \fB\-\-ctime\fR" +Print human-readable timestamps. +.IP +.B Be aware that the timestamp could be inaccurate! +The +.B time +source used for the logs is +.B not updated after +system +.BR SUSPEND / RESUME . +.IP "\fB\-t\fR, \fB\-\-notime\fR" +Do not print kernel's timestamps. +.IP "\fB\-\-time\-format\fR \fIformat\fR" +Print timestamps using the given \fIformat\fR, which can be +.BR ctime , +.BR reltime , +.B delta +or +.BR iso . +The first three formats are aliases of the time-format-specific options. +The +.B iso +format is a +.B dmesg +implementation of the ISO-8601 timestamp format. The purpose of this format is +to make the comparing of timestamps between two systems, and any other parsing, +easy. The definition of the \fBiso\fR timestamp is: +YYYY-MM-DD<T>HH:MM:SS,<microseconds><-+><timezone offset from UTC>. +.IP +The +.B iso +format has the same issue as +.BR ctime : +the time may be inaccurate when a system is suspended and resumed. +.TP +.BR \-u , " \-\-userspace" +Print userspace messages. +.TP +.BR \-w , " \-\-follow" +Wait for new messages. This feature is supported only on systems with +a readable /dev/kmsg (since kernel 3.5.0). +.TP +.BR \-x , " \-\-decode" +Decode facility and level (priority) numbers to human-readable prefixes. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH COLORS +Implicit coloring can be disabled by an empty file \fI/etc/terminal-colors.d/dmesg.disable\fR. +See +.BR terminal-colors.d (5) +for more details about colorization configuration. +.PP +The logical color names supported by +.B dmesg +are: +.TP +.B subsys +The message sub-system prefix (e.g. "ACPI:"). +.TP +.B time +The message timestamp. +.TP +.B timebreak +The message timestamp in short ctime format in \fB\-\-reltime\fR +or \fB\-\-human\fR output. +.TP +.B alert +The text of the message with the alert log priority. +.TP +.B crit +The text of the message with the critical log priority. +.TP +.B err +The text of the message with the error log priority. +.TP +.B warn +The text of the message with the warning log priority. +.TP +.B segfault +The text of the message that inform about segmentation fault. +.SH EXIT STATUS +.B dmesg +can fail reporting permission denied error. This is usually caused by +.B dmesg_restrict +kernel setting, please see +.BR syslog (2) +for more details. +.SH SEE ALSO +.BR terminal-colors.d (5), +.BR syslogd (8) +.SH AUTHORS +.MT kzak@redhat.com +Karel Zak +.ME + +.br +.B dmesg +was originally written by +.MT tytso@athena.mit.edu +Theodore Ts'o +.ME +.SH AVAILABILITY +The dmesg command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/dmesg.c b/sys-utils/dmesg.c new file mode 100644 index 0000000..ba4e225 --- /dev/null +++ b/sys-utils/dmesg.c @@ -0,0 +1,1547 @@ +/* + * dmesg.c -- Print out the contents of the kernel ring buffer + * + * Copyright (C) 1993 Theodore Ts'o <tytso@athena.mit.edu> + * Copyright (C) 2011 Karel Zak <kzak@redhat.com> + * + * This program comes with ABSOLUTELY NO WARRANTY. + */ +#include <stdio.h> +#include <getopt.h> +#include <stdlib.h> +#include <sys/klog.h> +#include <sys/syslog.h> +#include <sys/time.h> +#include <sys/sysinfo.h> +#include <ctype.h> +#include <time.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fcntl.h> + +#include "c.h" +#include "colors.h" +#include "nls.h" +#include "strutils.h" +#include "xalloc.h" +#include "widechar.h" +#include "all-io.h" +#include "bitops.h" +#include "closestream.h" +#include "optutils.h" +#include "timeutils.h" +#include "monotonic.h" +#include "mangle.h" +#include "pager.h" + +/* Close the log. Currently a NOP. */ +#define SYSLOG_ACTION_CLOSE 0 +/* Open the log. Currently a NOP. */ +#define SYSLOG_ACTION_OPEN 1 +/* Read from the log. */ +#define SYSLOG_ACTION_READ 2 +/* Read all messages remaining in the ring buffer. (allowed for non-root) */ +#define SYSLOG_ACTION_READ_ALL 3 +/* Read and clear all messages remaining in the ring buffer */ +#define SYSLOG_ACTION_READ_CLEAR 4 +/* Clear ring buffer. */ +#define SYSLOG_ACTION_CLEAR 5 +/* Disable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_OFF 6 +/* Enable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_ON 7 +/* Set level of messages printed to console */ +#define SYSLOG_ACTION_CONSOLE_LEVEL 8 +/* Return number of unread characters in the log buffer */ +#define SYSLOG_ACTION_SIZE_UNREAD 9 +/* Return size of the log buffer */ +#define SYSLOG_ACTION_SIZE_BUFFER 10 + +/* + * Color scheme + */ +struct dmesg_color { + const char *scheme; /* name used in termina-colors.d/dmesg.scheme */ + const char *dflt; /* default color ESC sequence */ +}; + +enum { + DMESG_COLOR_SUBSYS, + DMESG_COLOR_TIME, + DMESG_COLOR_TIMEBREAK, + DMESG_COLOR_ALERT, + DMESG_COLOR_CRIT, + DMESG_COLOR_ERR, + DMESG_COLOR_WARN, + DMESG_COLOR_SEGFAULT +}; + +static const struct dmesg_color colors[] = +{ + [DMESG_COLOR_SUBSYS] = { "subsys", UL_COLOR_BROWN }, + [DMESG_COLOR_TIME] = { "time", UL_COLOR_GREEN }, + [DMESG_COLOR_TIMEBREAK] = { "timebreak",UL_COLOR_GREEN UL_COLOR_BOLD }, + [DMESG_COLOR_ALERT] = { "alert", UL_COLOR_REVERSE UL_COLOR_RED }, + [DMESG_COLOR_CRIT] = { "crit", UL_COLOR_BOLD UL_COLOR_RED }, + [DMESG_COLOR_ERR] = { "err", UL_COLOR_RED }, + [DMESG_COLOR_WARN] = { "warn", UL_COLOR_BOLD }, + [DMESG_COLOR_SEGFAULT] = { "segfault", UL_COLOR_HALFBRIGHT UL_COLOR_RED } +}; + +#define dmesg_enable_color(_id) \ + color_scheme_enable(colors[_id].scheme, colors[_id].dflt); + +/* + * Priority and facility names + */ +struct dmesg_name { + const char *name; + const char *help; +}; + +/* + * Priority names -- based on sys/syslog.h + */ +static const struct dmesg_name level_names[] = +{ + [LOG_EMERG] = { "emerg", N_("system is unusable") }, + [LOG_ALERT] = { "alert", N_("action must be taken immediately") }, + [LOG_CRIT] = { "crit", N_("critical conditions") }, + [LOG_ERR] = { "err", N_("error conditions") }, + [LOG_WARNING] = { "warn", N_("warning conditions") }, + [LOG_NOTICE] = { "notice",N_("normal but significant condition") }, + [LOG_INFO] = { "info", N_("informational") }, + [LOG_DEBUG] = { "debug", N_("debug-level messages") } +}; + +/* + * sys/syslog.h uses (f << 3) for all facility codes. + * We want to use the codes as array indexes, so shift back... + * + * Note that libc LOG_FAC() macro returns the base codes, not the + * shifted code :-) + */ +#define FAC_BASE(f) ((f) >> 3) + +static const struct dmesg_name facility_names[] = +{ + [FAC_BASE(LOG_KERN)] = { "kern", N_("kernel messages") }, + [FAC_BASE(LOG_USER)] = { "user", N_("random user-level messages") }, + [FAC_BASE(LOG_MAIL)] = { "mail", N_("mail system") }, + [FAC_BASE(LOG_DAEMON)] = { "daemon", N_("system daemons") }, + [FAC_BASE(LOG_AUTH)] = { "auth", N_("security/authorization messages") }, + [FAC_BASE(LOG_SYSLOG)] = { "syslog", N_("messages generated internally by syslogd") }, + [FAC_BASE(LOG_LPR)] = { "lpr", N_("line printer subsystem") }, + [FAC_BASE(LOG_NEWS)] = { "news", N_("network news subsystem") }, + [FAC_BASE(LOG_UUCP)] = { "uucp", N_("UUCP subsystem") }, + [FAC_BASE(LOG_CRON)] = { "cron", N_("clock daemon") }, + [FAC_BASE(LOG_AUTHPRIV)] = { "authpriv", N_("security/authorization messages (private)") }, + [FAC_BASE(LOG_FTP)] = { "ftp", N_("FTP daemon") }, +}; + +/* supported methods to read message buffer + */ +enum { + DMESG_METHOD_KMSG, /* read messages from /dev/kmsg (default) */ + DMESG_METHOD_SYSLOG, /* klogctl() buffer */ + DMESG_METHOD_MMAP /* mmap file with records (see --file) */ +}; + +enum { + DMESG_TIMEFTM_NONE = 0, + DMESG_TIMEFTM_CTIME, /* [ctime] */ + DMESG_TIMEFTM_CTIME_DELTA, /* [ctime <delta>] */ + DMESG_TIMEFTM_DELTA, /* [<delta>] */ + DMESG_TIMEFTM_RELTIME, /* [relative] */ + DMESG_TIMEFTM_TIME, /* [time] */ + DMESG_TIMEFTM_TIME_DELTA, /* [time <delta>] */ + DMESG_TIMEFTM_ISO8601 /* 2013-06-13T22:11:00,123456+0100 */ +}; +#define is_timefmt(c, f) ((c)->time_fmt == (DMESG_TIMEFTM_ ##f)) + +struct dmesg_control { + /* bit arrays -- see include/bitops.h */ + char levels[ARRAY_SIZE(level_names) / NBBY + 1]; + char facilities[ARRAY_SIZE(facility_names) / NBBY + 1]; + + struct timeval lasttime; /* last printed timestamp */ + struct tm lasttm; /* last localtime */ + struct timeval boot_time; /* system boot time */ + + int action; /* SYSLOG_ACTION_* */ + int method; /* DMESG_METHOD_* */ + + size_t bufsize; /* size of syslog buffer */ + + int kmsg; /* /dev/kmsg file descriptor */ + ssize_t kmsg_first_read;/* initial read() return code */ + char kmsg_buf[BUFSIZ];/* buffer to read kmsg data */ + + /* + * For the --file option we mmap whole file. The unnecessary (already + * printed) pages are always unmapped. The result is that we have in + * memory only the currently used page(s). + */ + char *filename; + char *mmap_buff; + size_t pagesize; + unsigned int time_fmt; /* time format */ + + unsigned int follow:1, /* wait for new messages */ + raw:1, /* raw mode */ + fltr_lev:1, /* filter out by levels[] */ + fltr_fac:1, /* filter out by facilities[] */ + decode:1, /* use "facility: level: " prefix */ + pager:1, /* pipe output into a pager */ + color:1, /* colorize messages */ + force_prefix:1; /* force timestamp and decode prefix + on each line */ + int indent; /* due to timestamps if newline */ +}; + +struct dmesg_record { + const char *mesg; + size_t mesg_size; + + int level; + int facility; + struct timeval tv; + + const char *next; /* buffer with next unparsed record */ + size_t next_size; /* size of the next buffer */ +}; + +#define INIT_DMESG_RECORD(_r) do { \ + (_r)->mesg = NULL; \ + (_r)->mesg_size = 0; \ + (_r)->facility = -1; \ + (_r)->level = -1; \ + (_r)->tv.tv_sec = 0; \ + (_r)->tv.tv_usec = 0; \ + } while (0) + +static int read_kmsg(struct dmesg_control *ctl); + +static int set_level_color(int log_level, const char *mesg, size_t mesgsz) +{ + int id = -1; + + switch (log_level) { + case LOG_ALERT: + id = DMESG_COLOR_ALERT; + break; + case LOG_CRIT: + id = DMESG_COLOR_CRIT; + break; + case LOG_ERR: + id = DMESG_COLOR_ERR; + break; + case LOG_WARNING: + id = DMESG_COLOR_WARN; + break; + default: + break; + } + + /* well, sometimes the messages contains important keywords, but in + * non-warning/error messages + */ + if (id < 0 && memmem(mesg, mesgsz, "segfault at", 11)) + id = DMESG_COLOR_SEGFAULT; + + if (id >= 0) + dmesg_enable_color(id); + + return id >= 0 ? 0 : -1; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Display or control the kernel ring buffer.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -C, --clear clear the kernel ring buffer\n"), out); + fputs(_(" -c, --read-clear read and clear all messages\n"), out); + fputs(_(" -D, --console-off disable printing messages to console\n"), out); + fputs(_(" -E, --console-on enable printing messages to console\n"), out); + fputs(_(" -F, --file <file> use the file instead of the kernel log buffer\n"), out); + fputs(_(" -f, --facility <list> restrict output to defined facilities\n"), out); + fputs(_(" -H, --human human readable output\n"), out); + fputs(_(" -k, --kernel display kernel messages\n"), out); + fputs(_(" -L, --color[=<when>] colorize messages (auto, always or never)\n"), out); + fprintf(out, + " %s\n", USAGE_COLORS_DEFAULT); + fputs(_(" -l, --level <list> restrict output to defined levels\n"), out); + fputs(_(" -n, --console-level <level> set level of messages printed to console\n"), out); + fputs(_(" -P, --nopager do not pipe output into a pager\n"), out); + fputs(_(" -p, --force-prefix force timestamp output on each line of multi-line messages\n"), out); + fputs(_(" -r, --raw print the raw message buffer\n"), out); + fputs(_(" -S, --syslog force to use syslog(2) rather than /dev/kmsg\n"), out); + fputs(_(" -s, --buffer-size <size> buffer size to query the kernel ring buffer\n"), out); + fputs(_(" -u, --userspace display userspace messages\n"), out); + fputs(_(" -w, --follow wait for new messages\n"), out); + fputs(_(" -x, --decode decode facility and level to readable string\n"), out); + fputs(_(" -d, --show-delta show time delta between printed messages\n"), out); + fputs(_(" -e, --reltime show local time and time delta in readable format\n"), out); + fputs(_(" -T, --ctime show human-readable timestamp (may be inaccurate!)\n"), out); + fputs(_(" -t, --notime don't show any timestamp with messages\n"), out); + fputs(_(" --time-format <format> show timestamp using the given format:\n" + " [delta|reltime|ctime|notime|iso]\n" + "Suspending/resume will make ctime and iso timestamps inaccurate.\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(29)); + fputs(_("\nSupported log facilities:\n"), out); + for (i = 0; i < ARRAY_SIZE(level_names); i++) + fprintf(out, " %7s - %s\n", + facility_names[i].name, + _(facility_names[i].help)); + + fputs(_("\nSupported log levels (priorities):\n"), out); + for (i = 0; i < ARRAY_SIZE(level_names); i++) + fprintf(out, " %7s - %s\n", + level_names[i].name, + _(level_names[i].help)); + + printf(USAGE_MAN_TAIL("dmesg(1)")); + exit(EXIT_SUCCESS); +} + +/* + * LEVEL ::= <number> | <name> + * <number> ::= @len is set: number in range <0..N>, where N < ARRAY_SIZE(level_names) + * ::= @len not set: number in range <1..N>, where N <= ARRAY_SIZE(level_names) + * <name> ::= case-insensitive text + * + * Note that @len argument is not set when parsing "-n <level>" command line + * option. The console_level is interpreted as "log level less than the value". + * + * For example "dmesg -n 8" or "dmesg -n debug" enables debug console log + * level by klogctl(SYSLOG_ACTION_CONSOLE_LEVEL, NULL, 8). The @str argument + * has to be parsed to number in range <1..8>. + */ +static int parse_level(const char *str, size_t len) +{ + int offset = 0; + + if (!str) + return -1; + if (!len) { + len = strlen(str); + offset = 1; + } + errno = 0; + + if (isdigit(*str)) { + char *end = NULL; + long x = strtol(str, &end, 10) - offset; + + if (!errno && end && end > str && (size_t) (end - str) == len && + x >= 0 && (size_t) x < ARRAY_SIZE(level_names)) + return x + offset; + } else { + size_t i; + + for (i = 0; i < ARRAY_SIZE(level_names); i++) { + const char *n = level_names[i].name; + + if (strncasecmp(str, n, len) == 0 && *(n + len) == '\0') + return i + offset; + } + } + + if (errno) + err(EXIT_FAILURE, _("failed to parse level '%s'"), str); + + errx(EXIT_FAILURE, _("unknown level '%s'"), str); + return -1; +} + +/* + * FACILITY ::= <number> | <name> + * <number> ::= number in range <0..N>, where N < ARRAY_SIZE(facility_names) + * <name> ::= case-insensitive text + */ +static int parse_facility(const char *str, size_t len) +{ + if (!str) + return -1; + if (!len) + len = strlen(str); + errno = 0; + + if (isdigit(*str)) { + char *end = NULL; + long x = strtol(str, &end, 10); + + if (!errno && end && end > str && (size_t) (end - str) == len && + x >= 0 && (size_t) x < ARRAY_SIZE(facility_names)) + return x; + } else { + size_t i; + + for (i = 0; i < ARRAY_SIZE(facility_names); i++) { + const char *n = facility_names[i].name; + + if (strncasecmp(str, n, len) == 0 && *(n + len) == '\0') + return i; + } + } + + if (errno) + err(EXIT_FAILURE, _("failed to parse facility '%s'"), str); + + errx(EXIT_FAILURE, _("unknown facility '%s'"), str); + return -1; +} + +/* + * Parses numerical prefix used for all messages in kernel ring buffer. + * + * Priorities/facilities are encoded into a single 32-bit quantity, where the + * bottom 3 bits are the priority (0-7) and the top 28 bits are the facility + * (0-big number). + * + * Note that the number has to end with '>' or ',' char. + */ +static const char *parse_faclev(const char *str, int *fac, int *lev) +{ + long num; + char *end = NULL; + + if (!str) + return str; + + errno = 0; + num = strtol(str, &end, 10); + + if (!errno && end && end > str) { + *fac = LOG_FAC(num); + *lev = LOG_PRI(num); + + if (*lev < 0 || (size_t) *lev > ARRAY_SIZE(level_names)) + *lev = -1; + if (*fac < 0 || (size_t) *fac > ARRAY_SIZE(facility_names)) + *fac = -1; + return end + 1; /* skip '<' or ',' */ + } + + return str; +} + +/* + * Parses timestamp from syslog message prefix, expected format: + * + * seconds.microseconds] + * + * the ']' is the timestamp field terminator. + */ +static const char *parse_syslog_timestamp(const char *str0, struct timeval *tv) +{ + const char *str = str0; + char *end = NULL; + + if (!str0) + return str0; + + errno = 0; + tv->tv_sec = strtol(str, &end, 10); + + if (!errno && end && *end == '.' && *(end + 1)) { + str = end + 1; + end = NULL; + tv->tv_usec = strtol(str, &end, 10); + } + if (errno || !end || end == str || *end != ']') + return str0; + + return end + 1; /* skip ']' */ +} + +/* + * Parses timestamp from /dev/kmsg, expected formats: + * + * microseconds, + * microseconds; + * + * the ',' is fields separators and ';' items terminator (for the last item) + */ +static const char *parse_kmsg_timestamp(const char *str0, struct timeval *tv) +{ + const char *str = str0; + char *end = NULL; + uint64_t usec; + + if (!str0) + return str0; + + errno = 0; + usec = strtoumax(str, &end, 10); + + if (!errno && end && (*end == ';' || *end == ',')) { + tv->tv_usec = usec % 1000000; + tv->tv_sec = usec / 1000000; + } else + return str0; + + return end + 1; /* skip separator */ +} + + +static double time_diff(struct timeval *a, struct timeval *b) +{ + return (a->tv_sec - b->tv_sec) + (a->tv_usec - b->tv_usec) / 1E6; +} + +static int get_syslog_buffer_size(void) +{ + int n = klogctl(SYSLOG_ACTION_SIZE_BUFFER, NULL, 0); + + return n > 0 ? n : 0; +} + +/* + * Reads messages from regular file by mmap + */ +static ssize_t mmap_file_buffer(struct dmesg_control *ctl, char **buf) +{ + struct stat st; + int fd; + + if (!ctl->filename) + return -1; + + fd = open(ctl->filename, O_RDONLY); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), ctl->filename); + if (fstat(fd, &st)) + err(EXIT_FAILURE, _("stat of %s failed"), ctl->filename); + + *buf = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + if (*buf == MAP_FAILED) + err(EXIT_FAILURE, _("cannot mmap: %s"), ctl->filename); + ctl->mmap_buff = *buf; + ctl->pagesize = getpagesize(); + close(fd); + + return st.st_size; +} + +/* + * Reads messages from kernel ring buffer by klogctl() + */ +static ssize_t read_syslog_buffer(struct dmesg_control *ctl, char **buf) +{ + size_t sz; + int rc = -1; + + if (ctl->bufsize) { + sz = ctl->bufsize + 8; + *buf = xmalloc(sz * sizeof(char)); + rc = klogctl(ctl->action, *buf, sz); + } else { + sz = 16392; + while (1) { + *buf = xmalloc(sz * sizeof(char)); + rc = klogctl(SYSLOG_ACTION_READ_ALL, *buf, sz); + if (rc < 0) + break; + if ((size_t) rc != sz || sz > (1 << 28)) + break; + free(*buf); + *buf = NULL; + sz *= 4; + } + + if (rc > 0 && ctl->action == SYSLOG_ACTION_READ_CLEAR) + rc = klogctl(SYSLOG_ACTION_READ_CLEAR, *buf, sz); + } + + return rc; +} + +/* + * Top level function to read messages + */ +static ssize_t read_buffer(struct dmesg_control *ctl, char **buf) +{ + ssize_t n = -1; + + switch (ctl->method) { + case DMESG_METHOD_MMAP: + n = mmap_file_buffer(ctl, buf); + break; + case DMESG_METHOD_SYSLOG: + if (!ctl->bufsize) + ctl->bufsize = get_syslog_buffer_size(); + + n = read_syslog_buffer(ctl, buf); + break; + case DMESG_METHOD_KMSG: + /* + * Since kernel 3.5.0 + */ + n = read_kmsg(ctl); + if (n == 0 && ctl->action == SYSLOG_ACTION_READ_CLEAR) + n = klogctl(SYSLOG_ACTION_CLEAR, NULL, 0); + break; + default: + abort(); /* impossible method -> drop core */ + } + + return n; +} + +static int fwrite_hex(const char *buf, size_t size, FILE *out) +{ + size_t i; + + for (i = 0; i < size; i++) { + int rc = fprintf(out, "\\x%02hhx", buf[i]); + if (rc < 0) + return rc; + } + return 0; +} + +/* + * Prints to 'out' and non-printable chars are replaced with \x<hex> sequences. + */ +static void safe_fwrite(const char *buf, size_t size, int indent, FILE *out) +{ + size_t i; +#ifdef HAVE_WIDECHAR + mbstate_t s; + memset(&s, 0, sizeof (s)); +#endif + for (i = 0; i < size; i++) { + const char *p = buf + i; + int rc, hex = 0; + size_t len; + +#ifdef HAVE_WIDECHAR + wchar_t wc; + len = mbrtowc(&wc, p, size - i, &s); + + if (len == 0) /* L'\0' */ + return; + + if (len == (size_t)-1 || len == (size_t)-2) { /* invalid sequence */ + memset(&s, 0, sizeof (s)); + len = hex = 1; + } else if (len > 1 && !iswprint(wc)) { /* non-printable multibyte */ + hex = 1; + } + i += len - 1; +#else + len = 1; + if (!isprint((unsigned char) *p) && + !isspace((unsigned char) *p)) /* non-printable */ + hex = 1; +#endif + if (hex) + rc = fwrite_hex(p, len, out); + else if (*p == '\n' && *(p + 1) && indent) { + rc = fwrite(p, 1, len, out) != len; + if (fprintf(out, "%*s", indent, "") != indent) + rc |= 1; + } + else + rc = fwrite(p, 1, len, out) != len; + if (rc != 0) { + if (errno != EPIPE) + err(EXIT_FAILURE, _("write failed")); + exit(EXIT_SUCCESS); + } + } +} + +static const char *skip_item(const char *begin, const char *end, const char *sep) +{ + while (begin < end) { + int c = *begin++; + + if (c == '\0' || strchr(sep, c)) + break; + } + + return begin; +} + +/* + * Parses one record from syslog(2) buffer + */ +static int get_next_syslog_record(struct dmesg_control *ctl, + struct dmesg_record *rec) +{ + size_t i; + const char *begin = NULL; + + if (ctl->method != DMESG_METHOD_MMAP && + ctl->method != DMESG_METHOD_SYSLOG) + return -1; + + if (!rec->next || !rec->next_size) + return 1; + + INIT_DMESG_RECORD(rec); + + /* + * Unmap already printed file data from memory + */ + if (ctl->mmap_buff && (size_t) (rec->next - ctl->mmap_buff) > ctl->pagesize) { + void *x = ctl->mmap_buff; + + ctl->mmap_buff += ctl->pagesize; + munmap(x, ctl->pagesize); + } + + for (i = 0; i < rec->next_size; i++) { + const char *p = rec->next + i; + const char *end = NULL; + + if (!begin) + begin = p; + if (i + 1 == rec->next_size) { + end = p + 1; + i++; + } else if (*p == '\n' && *(p + 1) == '<') + end = p; + + if (begin && !*begin) + begin = NULL; /* zero(s) at the end of the buffer? */ + if (!begin || !end) + continue; + if (end <= begin) + continue; /* error or empty line? */ + + if (*begin == '<') { + if (ctl->fltr_lev || ctl->fltr_fac || ctl->decode || ctl->color) + begin = parse_faclev(begin + 1, &rec->facility, + &rec->level); + else + begin = skip_item(begin, end, ">"); + } + + if (*begin == '[' && (*(begin + 1) == ' ' || + isdigit(*(begin + 1)))) { + + if (!is_timefmt(ctl, NONE)) + begin = parse_syslog_timestamp(begin + 1, &rec->tv); + else + begin = skip_item(begin, end, "]"); + + if (begin < end && *begin == ' ') + begin++; + } + + rec->mesg = begin; + rec->mesg_size = end - begin; + + /* Don't count \n from the last message to the message size */ + if (*end != '\n' && *(end - 1) == '\n') + rec->mesg_size--; + + rec->next_size -= end - rec->next; + rec->next = rec->next_size > 0 ? end + 1 : NULL; + if (rec->next_size > 0) + rec->next_size--; + + return 0; + } + + return 1; +} + +static int accept_record(struct dmesg_control *ctl, struct dmesg_record *rec) +{ + if (ctl->fltr_lev && (rec->facility < 0 || + !isset(ctl->levels, rec->level))) + return 0; + + if (ctl->fltr_fac && (rec->facility < 0 || + !isset(ctl->facilities, rec->facility))) + return 0; + + return 1; +} + +static void raw_print(struct dmesg_control *ctl, const char *buf, size_t size) +{ + int lastc = '\n'; + + if (!ctl->mmap_buff) { + /* + * Print whole ring buffer + */ + safe_fwrite(buf, size, 0, stdout); + lastc = buf[size - 1]; + } else { + /* + * Print file in small chunks to save memory + */ + while (size) { + size_t sz = size > ctl->pagesize ? ctl->pagesize : size; + char *x = ctl->mmap_buff; + + safe_fwrite(x, sz, 0, stdout); + lastc = x[sz - 1]; + size -= sz; + ctl->mmap_buff += sz; + munmap(x, sz); + } + } + + if (lastc != '\n') + putchar('\n'); +} + +static struct tm *record_localtime(struct dmesg_control *ctl, + struct dmesg_record *rec, + struct tm *tm) +{ + time_t t = ctl->boot_time.tv_sec + rec->tv.tv_sec; + return localtime_r(&t, tm); +} + +static char *record_ctime(struct dmesg_control *ctl, + struct dmesg_record *rec, + char *buf, size_t bufsiz) +{ + struct tm tm; + + record_localtime(ctl, rec, &tm); + + if (strftime(buf, bufsiz, "%a %b %e %H:%M:%S %Y", &tm) == 0) + *buf = '\0'; + return buf; +} + +static char *short_ctime(struct tm *tm, char *buf, size_t bufsiz) +{ + if (strftime(buf, bufsiz, "%b%e %H:%M", tm) == 0) + *buf = '\0'; + return buf; +} + +static char *iso_8601_time(struct dmesg_control *ctl, struct dmesg_record *rec, + char *buf, size_t bufsz) +{ + struct timeval tv = { + .tv_sec = ctl->boot_time.tv_sec + rec->tv.tv_sec, + .tv_usec = rec->tv.tv_usec + }; + + if (strtimeval_iso(&tv, ISO_TIMESTAMP_COMMA_T, buf, bufsz) != 0) + return NULL; + + return buf; +} + +static double record_count_delta(struct dmesg_control *ctl, + struct dmesg_record *rec) +{ + double delta = 0; + + if (timerisset(&ctl->lasttime)) + delta = time_diff(&rec->tv, &ctl->lasttime); + + ctl->lasttime = rec->tv; + return delta; +} + +static const char *get_subsys_delimiter(const char *mesg, size_t mesg_size) +{ + const char *p = mesg; + size_t sz = mesg_size; + + while (sz > 0) { + const char *d = strnchr(p, sz, ':'); + if (!d) + return NULL; + sz -= d - p + 1; + if (sz) { + if (isblank(*(d + 1))) + return d; + p = d + 1; + } + } + return NULL; +} + +static void print_record(struct dmesg_control *ctl, + struct dmesg_record *rec) +{ + char buf[128]; + char fpbuf[32] = "\0"; + char tsbuf[64] = "\0"; + size_t mesg_size = rec->mesg_size; + int timebreak = 0; + char *mesg_copy = NULL; + const char *line = NULL; + + if (!accept_record(ctl, rec)) + return; + + if (!rec->mesg_size) { + putchar('\n'); + return; + } + + /* + * Compose syslog(2) compatible raw output -- used for /dev/kmsg for + * backward compatibility with syslog(2) buffers only + */ + if (ctl->raw) { + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), + "<%d>[%5ld.%06ld] ", + LOG_MAKEPRI(rec->facility, rec->level), + (long) rec->tv.tv_sec, + (long) rec->tv.tv_usec); + goto full_output; + } + + /* Store decode information (facility & priority level) in a buffer */ + if (ctl->decode && + (rec->level > -1) && (rec->level < (int) ARRAY_SIZE(level_names)) && + (rec->facility > -1) && + (rec->facility < (int) ARRAY_SIZE(facility_names))) + snprintf(fpbuf, sizeof(fpbuf), "%-6s:%-6s: ", + facility_names[rec->facility].name, + level_names[rec->level].name); + + /* Store the timestamp in a buffer */ + switch (ctl->time_fmt) { + double delta; + struct tm cur; + case DMESG_TIMEFTM_NONE: + ctl->indent = 0; + break; + case DMESG_TIMEFTM_CTIME: + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), "[%s] ", + record_ctime(ctl, rec, buf, sizeof(buf))); + break; + case DMESG_TIMEFTM_CTIME_DELTA: + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), "[%s <%12.06f>] ", + record_ctime(ctl, rec, buf, sizeof(buf)), + record_count_delta(ctl, rec)); + break; + case DMESG_TIMEFTM_DELTA: + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), "[<%12.06f>] ", + record_count_delta(ctl, rec)); + break; + case DMESG_TIMEFTM_RELTIME: + record_localtime(ctl, rec, &cur); + delta = record_count_delta(ctl, rec); + if (cur.tm_min != ctl->lasttm.tm_min || + cur.tm_hour != ctl->lasttm.tm_hour || + cur.tm_yday != ctl->lasttm.tm_yday) { + timebreak = 1; + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), "[%s] ", + short_ctime(&cur, buf, + sizeof(buf))); + } else { + if (delta < 10) + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), + "[ %+8.06f] ", delta); + else + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), + "[ %+9.06f] ", delta); + } + ctl->lasttm = cur; + break; + case DMESG_TIMEFTM_TIME: + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), "[%5ld.%06ld] ", + (long)rec->tv.tv_sec, + (long)rec->tv.tv_usec); + break; + case DMESG_TIMEFTM_TIME_DELTA: + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), "[%5ld.%06ld <%12.06f>] ", + (long)rec->tv.tv_sec, + (long)rec->tv.tv_usec, + record_count_delta(ctl, rec)); + break; + case DMESG_TIMEFTM_ISO8601: + ctl->indent = snprintf(tsbuf, sizeof(tsbuf), "%s ", + iso_8601_time(ctl, rec, buf, + sizeof(buf))); + break; + default: + abort(); + } + + ctl->indent += strlen(fpbuf); + +full_output: + /* Output the decode information */ + if (*fpbuf) + fputs(fpbuf, stdout); + + /* Output the timestamp buffer */ + if (*tsbuf) { + /* Colorize the timestamp */ + if (ctl->color) + dmesg_enable_color(timebreak ? DMESG_COLOR_TIMEBREAK : + DMESG_COLOR_TIME); + if (ctl->time_fmt != DMESG_TIMEFTM_RELTIME) { + fputs(tsbuf, stdout); + } else { + /* + * For relative timestamping, the first line's + * timestamp is the offset and all other lines will + * report an offset of 0.000000. + */ + if (!line) + fputs(tsbuf, stdout); + else + printf("[ +0.000000] "); + } + if (ctl->color) + color_disable(); + } + + /* + * A kernel message may contain several lines of output, separated + * by '\n'. If the timestamp and decode outputs are forced then each + * line of the message must be displayed with that information. + */ + if (ctl->force_prefix) { + if (!line) { + mesg_copy = xstrdup(rec->mesg); + line = strtok(mesg_copy, "\n"); + mesg_size = strlen(line); + } + } else { + line = rec->mesg; + mesg_size = rec->mesg_size; + } + + /* Colorize kernel message output */ + if (ctl->color) { + /* Subsystem prefix */ + const char *subsys = get_subsys_delimiter(line, mesg_size); + int has_color = 0; + + if (subsys) { + dmesg_enable_color(DMESG_COLOR_SUBSYS); + safe_fwrite(line, subsys - line, ctl->indent, stdout); + color_disable(); + + mesg_size -= subsys - line; + line = subsys; + } + /* Error, alert .. etc. colors */ + has_color = set_level_color(rec->level, line, mesg_size) == 0; + safe_fwrite(line, mesg_size, ctl->indent, stdout); + if (has_color) + color_disable(); + } else + safe_fwrite(line, mesg_size, ctl->indent, stdout); + + /* Get the next line */ + if (ctl->force_prefix) { + line = strtok(NULL, "\n"); + if (line && *line) { + putchar('\n'); + mesg_size = strlen(line); + goto full_output; + } + free(mesg_copy); + } + + putchar('\n'); +} + +/* + * Prints the 'buf' kernel ring buffer; the messages are filtered out according + * to 'levels' and 'facilities' bitarrays. + */ +static void print_buffer(struct dmesg_control *ctl, + const char *buf, size_t size) +{ + struct dmesg_record rec = { .next = buf, .next_size = size }; + + if (ctl->raw) { + raw_print(ctl, buf, size); + return; + } + + while (get_next_syslog_record(ctl, &rec) == 0) + print_record(ctl, &rec); +} + +static ssize_t read_kmsg_one(struct dmesg_control *ctl) +{ + ssize_t size; + + /* kmsg returns EPIPE if record was modified while reading */ + do { + size = read(ctl->kmsg, ctl->kmsg_buf, + sizeof(ctl->kmsg_buf) - 1); + } while (size < 0 && errno == EPIPE); + + return size; +} + +static int init_kmsg(struct dmesg_control *ctl) +{ + int mode = O_RDONLY; + + if (!ctl->follow) + mode |= O_NONBLOCK; + else + setlinebuf(stdout); + + ctl->kmsg = open("/dev/kmsg", mode); + if (ctl->kmsg < 0) + return -1; + + /* + * Seek after the last record available at the time + * the last SYSLOG_ACTION_CLEAR was issued. + * + * ... otherwise SYSLOG_ACTION_CLEAR will have no effect for kmsg. + */ + lseek(ctl->kmsg, 0, SEEK_DATA); + + /* + * Old kernels (<3.5) allow to successfully open /dev/kmsg for + * read-only, but read() returns -EINVAL :-((( + * + * Let's try to read the first record. The record is later processed in + * read_kmsg(). + */ + ctl->kmsg_first_read = read_kmsg_one(ctl); + if (ctl->kmsg_first_read < 0) { + close(ctl->kmsg); + ctl->kmsg = -1; + return -1; + } + + return 0; +} + +/* + * /dev/kmsg record format: + * + * faclev,seqnum,timestamp[optional, ...];message\n + * TAGNAME=value + * ... + * + * - fields are separated by ',' + * - last field is terminated by ';' + * + */ +#define LAST_KMSG_FIELD(s) (!s || !*s || *(s - 1) == ';') + +static int parse_kmsg_record(struct dmesg_control *ctl, + struct dmesg_record *rec, + char *buf, + size_t sz) +{ + const char *p = buf, *end; + + if (sz == 0 || !buf || !*buf) + return -1; + + end = buf + (sz - 1); + INIT_DMESG_RECORD(rec); + + while (p < end && isspace(*p)) + p++; + + /* A) priority and facility */ + if (ctl->fltr_lev || ctl->fltr_fac || ctl->decode || + ctl->raw || ctl->color) + p = parse_faclev(p, &rec->facility, &rec->level); + else + p = skip_item(p, end, ","); + if (LAST_KMSG_FIELD(p)) + goto mesg; + + /* B) sequence number */ + p = skip_item(p, end, ",;"); + if (LAST_KMSG_FIELD(p)) + goto mesg; + + /* C) timestamp */ + if (is_timefmt(ctl, NONE)) + p = skip_item(p, end, ",;"); + else + p = parse_kmsg_timestamp(p, &rec->tv); + if (LAST_KMSG_FIELD(p)) + goto mesg; + + /* D) optional fields (ignore) */ + p = skip_item(p, end, ";"); + +mesg: + /* E) message text */ + rec->mesg = p; + p = skip_item(p, end, "\n"); + if (!p) + return -1; + + /* The message text is terminated by \n, but it's possible that the + * message contains another stuff behind this linebreak; in this case + * the previous skip_item() returns pointer to the stuff behind \n. + * Let's normalize all these situations and make sure we always point to + * the \n. + * + * Note that the next unhexmangle_to_buffer() will replace \n by \0. + */ + if (*p && *p != '\n') + p--; + + /* + * Kernel escapes non-printable characters, unfortunately kernel + * definition of "non-printable" is too strict. On UTF8 console we can + * print many chars, so let's decode from kernel. + */ + rec->mesg_size = unhexmangle_to_buffer(rec->mesg, + (char *) rec->mesg, p - rec->mesg + 1); + + rec->mesg_size--; /* don't count \0 */ + + /* F) message tags (ignore) */ + + return 0; +} + +/* + * Note that each read() call for /dev/kmsg returns always one record. It means + * that we don't have to read whole message buffer before the records parsing. + * + * So this function does not compose one huge buffer (like read_syslog_buffer()) + * and print_buffer() is unnecessary. All is done in this function. + * + * Returns 0 on success, -1 on error. + */ +static int read_kmsg(struct dmesg_control *ctl) +{ + struct dmesg_record rec; + ssize_t sz; + + if (ctl->method != DMESG_METHOD_KMSG || ctl->kmsg < 0) + return -1; + + /* + * The very first read() call is done in kmsg_init() where we test + * /dev/kmsg usability. The return code from the initial read() is + * stored in ctl->kmsg_first_read; + */ + sz = ctl->kmsg_first_read; + + while (sz > 0) { + *(ctl->kmsg_buf + sz) = '\0'; /* for debug messages */ + + if (parse_kmsg_record(ctl, &rec, + ctl->kmsg_buf, (size_t) sz) == 0) + print_record(ctl, &rec); + + sz = read_kmsg_one(ctl); + } + + return 0; +} + +static int which_time_format(const char *s) +{ + if (!strcmp(s, "notime")) + return DMESG_TIMEFTM_NONE; + if (!strcmp(s, "ctime")) + return DMESG_TIMEFTM_CTIME; + if (!strcmp(s, "delta")) + return DMESG_TIMEFTM_DELTA; + if (!strcmp(s, "reltime")) + return DMESG_TIMEFTM_RELTIME; + if (!strcmp(s, "iso")) + return DMESG_TIMEFTM_ISO8601; + errx(EXIT_FAILURE, _("unknown time format: %s"), s); +} + +#ifdef TEST_DMESG +static inline int dmesg_get_boot_time(struct timeval *tv) +{ + char *str = getenv("DMESG_TEST_BOOTIME"); + uintmax_t sec, usec; + + if (str && sscanf(str, "%ju.%ju", &sec, &usec) == 2) { + tv->tv_sec = sec; + tv->tv_usec = usec; + return tv->tv_sec >= 0 && tv->tv_usec >= 0 ? 0 : -EINVAL; + } + + return get_boot_time(tv); +} +#else +# define dmesg_get_boot_time get_boot_time +#endif + +int main(int argc, char *argv[]) +{ + char *buf = NULL; + int c, nopager = 0; + int console_level = 0; + int klog_rc = 0; + int delta = 0; + ssize_t n; + static struct dmesg_control ctl = { + .filename = NULL, + .action = SYSLOG_ACTION_READ_ALL, + .method = DMESG_METHOD_KMSG, + .kmsg = -1, + .time_fmt = DMESG_TIMEFTM_TIME, + .indent = 0, + }; + int colormode = UL_COLORMODE_UNDEF; + enum { + OPT_TIME_FORMAT = CHAR_MAX + 1, + }; + + static const struct option longopts[] = { + { "buffer-size", required_argument, NULL, 's' }, + { "clear", no_argument, NULL, 'C' }, + { "color", optional_argument, NULL, 'L' }, + { "console-level", required_argument, NULL, 'n' }, + { "console-off", no_argument, NULL, 'D' }, + { "console-on", no_argument, NULL, 'E' }, + { "decode", no_argument, NULL, 'x' }, + { "file", required_argument, NULL, 'F' }, + { "facility", required_argument, NULL, 'f' }, + { "follow", no_argument, NULL, 'w' }, + { "human", no_argument, NULL, 'H' }, + { "help", no_argument, NULL, 'h' }, + { "kernel", no_argument, NULL, 'k' }, + { "level", required_argument, NULL, 'l' }, + { "syslog", no_argument, NULL, 'S' }, + { "raw", no_argument, NULL, 'r' }, + { "read-clear", no_argument, NULL, 'c' }, + { "reltime", no_argument, NULL, 'e' }, + { "show-delta", no_argument, NULL, 'd' }, + { "ctime", no_argument, NULL, 'T' }, + { "notime", no_argument, NULL, 't' }, + { "nopager", no_argument, NULL, 'P' }, + { "userspace", no_argument, NULL, 'u' }, + { "version", no_argument, NULL, 'V' }, + { "time-format", required_argument, NULL, OPT_TIME_FORMAT }, + { "force-prefix", no_argument, NULL, 'p' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'C','D','E','c','n','r' }, /* clear,off,on,read-clear,level,raw*/ + { 'H','r' }, /* human, raw */ + { 'L','r' }, /* color, raw */ + { 'S','w' }, /* syslog,follow */ + { 'T','r' }, /* ctime, raw */ + { 'd','r' }, /* delta, raw */ + { 'e','r' }, /* reltime, raw */ + { 'r','x' }, /* raw, decode */ + { 'r','t' }, /* notime, raw */ + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "CcDdEeF:f:HhkL::l:n:iPprSs:TtuVwx", + longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'C': + ctl.action = SYSLOG_ACTION_CLEAR; + break; + case 'c': + ctl.action = SYSLOG_ACTION_READ_CLEAR; + break; + case 'D': + ctl.action = SYSLOG_ACTION_CONSOLE_OFF; + break; + case 'd': + delta = 1; + break; + case 'E': + ctl.action = SYSLOG_ACTION_CONSOLE_ON; + break; + case 'e': + ctl.time_fmt = DMESG_TIMEFTM_RELTIME; + break; + case 'F': + ctl.filename = optarg; + ctl.method = DMESG_METHOD_MMAP; + break; + case 'f': + ctl.fltr_fac = 1; + if (string_to_bitarray(optarg, + ctl.facilities, parse_facility) < 0) + return EXIT_FAILURE; + break; + case 'H': + ctl.time_fmt = DMESG_TIMEFTM_RELTIME; + colormode = UL_COLORMODE_AUTO; + ctl.pager = 1; + break; + case 'h': + usage(); + break; + case 'k': + ctl.fltr_fac = 1; + setbit(ctl.facilities, FAC_BASE(LOG_KERN)); + break; + case 'L': + colormode = UL_COLORMODE_AUTO; + if (optarg) + colormode = colormode_or_err(optarg, + _("unsupported color mode")); + break; + case 'l': + ctl.fltr_lev= 1; + if (string_to_bitarray(optarg, + ctl.levels, parse_level) < 0) + return EXIT_FAILURE; + break; + case 'n': + ctl.action = SYSLOG_ACTION_CONSOLE_LEVEL; + console_level = parse_level(optarg, 0); + break; + case 'P': + nopager = 1; + break; + case 'p': + ctl.force_prefix = 1; + break; + case 'r': + ctl.raw = 1; + break; + case 'S': + ctl.method = DMESG_METHOD_SYSLOG; + break; + case 's': + ctl.bufsize = strtou32_or_err(optarg, + _("invalid buffer size argument")); + if (ctl.bufsize < 4096) + ctl.bufsize = 4096; + break; + case 'T': + ctl.time_fmt = DMESG_TIMEFTM_CTIME; + break; + case 't': + ctl.time_fmt = DMESG_TIMEFTM_NONE; + break; + case 'u': + ctl.fltr_fac = 1; + for (n = 1; (size_t) n < ARRAY_SIZE(facility_names); n++) + setbit(ctl.facilities, n); + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'w': + ctl.follow = 1; + break; + case 'x': + ctl.decode = 1; + break; + case OPT_TIME_FORMAT: + ctl.time_fmt = which_time_format(optarg); + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (argc != optind) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + if ((is_timefmt(&ctl, RELTIME) || + is_timefmt(&ctl, CTIME) || + is_timefmt(&ctl, ISO8601)) + && dmesg_get_boot_time(&ctl.boot_time) != 0) + ctl.time_fmt = DMESG_TIMEFTM_NONE; + + if (delta) + switch (ctl.time_fmt) { + case DMESG_TIMEFTM_CTIME: + ctl.time_fmt = DMESG_TIMEFTM_CTIME_DELTA; + break; + case DMESG_TIMEFTM_TIME: + ctl.time_fmt = DMESG_TIMEFTM_TIME_DELTA; + break; + case DMESG_TIMEFTM_ISO8601: + warnx(_("--show-delta is ignored when used together with iso8601 time format")); + break; + default: + ctl.time_fmt = DMESG_TIMEFTM_DELTA; + } + + + ctl.color = colors_init(colormode, "dmesg") ? 1 : 0; + if (ctl.follow) + nopager = 1; + ctl.pager = nopager ? 0 : ctl.pager; + if (ctl.pager) + pager_redirect(); + + switch (ctl.action) { + case SYSLOG_ACTION_READ_ALL: + case SYSLOG_ACTION_READ_CLEAR: + if (ctl.method == DMESG_METHOD_KMSG && init_kmsg(&ctl) != 0) + ctl.method = DMESG_METHOD_SYSLOG; + + if (ctl.raw + && ctl.method != DMESG_METHOD_KMSG + && (ctl.fltr_lev || ctl.fltr_fac)) + errx(EXIT_FAILURE, _("--raw can be used together with --level or " + "--facility only when reading messages from /dev/kmsg")); + + /* only kmsg supports multi-line messages */ + if (ctl.force_prefix && ctl.method != DMESG_METHOD_KMSG) + ctl.force_prefix = 0; + + if (ctl.pager) + pager_redirect(); + n = read_buffer(&ctl, &buf); + if (n > 0) + print_buffer(&ctl, buf, n); + if (!ctl.mmap_buff) + free(buf); + if (n < 0) + err(EXIT_FAILURE, _("read kernel buffer failed")); + if (ctl.kmsg >= 0) + close(ctl.kmsg); + break; + case SYSLOG_ACTION_CLEAR: + case SYSLOG_ACTION_CONSOLE_OFF: + case SYSLOG_ACTION_CONSOLE_ON: + klog_rc = klogctl(ctl.action, NULL, 0); + break; + case SYSLOG_ACTION_CONSOLE_LEVEL: + klog_rc = klogctl(ctl.action, NULL, console_level); + break; + default: + errx(EXIT_FAILURE, _("unsupported command")); + break; + } + + + if (klog_rc) + err(EXIT_FAILURE, _("klogctl failed")); + + return EXIT_SUCCESS; +} diff --git a/sys-utils/eject.1 b/sys-utils/eject.1 new file mode 100644 index 0000000..f901b23 --- /dev/null +++ b/sys-utils/eject.1 @@ -0,0 +1,187 @@ +.\" Copyright (C) 1994-2005 Jeff Tranter (tranter@pobox.com) +.\" Copyright (C) 2012 Karel Zak <kzak@redhat.com> +.\" +.\" It may be distributed under the GNU Public License, version 2, or +.\" any higher version. See section COPYING of the GNU Public license +.\" for conditions under which this file may be redistributed. +.TH EJECT 1 "April 2012" "Linux" "User Commands" +.SH NAME +eject \- eject removable media +.SH SYNOPSIS +.B eject +[options] +.IR device | mountpoint +.SH DESCRIPTION +.B eject +allows removable media (typically a CD-ROM, floppy disk, tape, JAZ, ZIP or USB +disk) to be ejected under software control. The command can also control some +multi-disc CD-ROM changers, the auto-eject feature supported by some devices, +and close the disc tray of some CD-ROM drives. +.PP +The device corresponding to \fIdevice\fP or \fImountpoint\fP is ejected. If no +name is specified, the default name \fB/dev/cdrom\fR is used. The device may be +addressed by device name (e.g. 'sda'), device path (e.g. '/dev/sda'), +UUID=\fIuuid\fR or LABEL=\fIlabel\fR tags. +.PP +There are four different methods of ejecting, depending on whether the device +is a CD-ROM, SCSI device, removable floppy, or tape. By default \fBeject\fR tries +all four methods in order until it succeeds. +.PP +If a device partition is specified, the whole-disk device is used. If the device +or a device partition is currently mounted, it is unmounted before ejecting. +.SH OPTIONS +.TP +.BR \-a , " \-\-auto on" | off +This option controls the auto-eject mode, supported by some devices. When +enabled, the drive automatically ejects when the device is closed. +.TP +.BR \-c , " \-\-changerslot " \fIslot +With this option a CD slot can be selected from an ATAPI/IDE CD-ROM changer. +The CD-ROM drive cannot be in use (mounted data CD or playing a music CD) for +a change request to work. Please also note that the first slot of the changer +is referred to as 0, not 1. +.TP +.BR \-d , " \-\-default" +List the default device name. +.TP +.BR \-F , " \-\-force" +Force eject, don't check device type. +.TP +.BR \-f , " \-\-floppy" +This option specifies that the drive should be ejected using a removable floppy +disk eject command. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.TP +.BR \-i , " \-\-manualeject on" | off +This option controls locking of the hardware eject button. When enabled, the +drive will not be ejected when the button is pressed. This is useful when you +are carrying a laptop in a bag or case and don't want it to eject if the button +is inadvertently pressed. +.TP +.BR \-M , " \-\-no\-partitions\-unmount" +The option tells eject to not try to unmount other partitions on partitioned +devices. If another partition is still mounted, the program will not attempt +to eject the media. It will attempt to unmount only the device or mountpoint +given on the command line. +.TP +.BR \-m , " \-\-no\-unmount" +The option tells eject to not try to unmount at all. +.TP +.BR \-n , " \-\-noop" +With this option the selected device is displayed but no action is performed. +.TP +.BR \-p , " \-\-proc" +This option allows you to use /proc/mounts instead /etc/mtab. It also passes the +\fB\-n\fR option to \fBumount\fR(8). +.TP +.BR \-q , " \-\-tape" +This option specifies that the drive should be ejected using a tape drive +offline command. +.TP +.BR \-r , " \-\-cdrom" +This option specifies that the drive should be ejected using a CDROM eject +command. +.TP +.BR \-s , " \-\-scsi" +This option specifies that the drive should be ejected using SCSI commands. +.TP +.BR \-T , " \-\-traytoggle" +With this option the drive is given a CD-ROM tray close command if it's opened, +and a CD-ROM tray eject command if it's closed. Not all devices support this +command, because it uses the above CD-ROM tray close command. +.TP +.BR \-t , " \-\-trayclose" +With this option the drive is given a CD-ROM tray close command. Not all +devices support this command. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-v , " \-\-verbose" +Run in verbose mode; more information is displayed about what the command is +doing. +.TP +.BR \-X , " \-\-listspeed" +With this option the CD-ROM drive will be probed to detect the available +speeds. The output is a list of speeds which can be used as an argument of the +\fB\-x\fR option. This only works with Linux 2.6.13 or higher, on previous versions +solely the maximum speed will be reported. Also note that some drives may not +correctly report the speed and therefore this option does not work with them. +.TP +.BR \-x , " \-\-cdspeed " \fIspeed +With this option the drive is given a CD-ROM select speed command. The +.I speed +argument is a number indicating the desired speed (e.g. 8 for 8X speed), or 0 +for maximum data rate. Not all devices support this command and you can only +specify speeds that the drive is capable of. Every time the media is changed +this option is cleared. This option can be used alone, or with the +\fB\-t\fR and \fB\-c\fR options. +.SH EXIT STATUS +Returns 0 if operation was successful, 1 if operation failed or command syntax +was not valid. +.SH NOTES +.B eject +only works with devices that support one or more of the four methods of +ejecting. This includes most CD-ROM drives (IDE, SCSI, and proprietary), some +SCSI tape drives, JAZ drives, ZIP drives (parallel port, SCSI, and IDE +versions), and LS120 removable floppies. Users have also reported success with +floppy drives on Sun SPARC and Apple Macintosh systems. If +.B eject +does not work, it is most likely a limitation of the kernel driver for the +device and not the +.B eject +program itself. +.PP +The \fB\-r\fR, \fB\-s\fR, \fB\-f\fR, and \fB\-q\fR options allow controlling +which methods are used to +eject. More than one method can be specified. If none of these options are +specified, it tries all four (this works fine in most cases). +.PP +.B eject +may not always be able to determine if the device is mounted (e.g. if it has +several names). If the device name is a symbolic link, +.B eject +will follow the link and use the device that it points to. +.PP +If +.B eject +determines that the device can have multiple partitions, it will attempt to +unmount all mounted partitions of the device before ejecting (see also +\fB--no-partitions-unmount\fR). If an unmount fails, the program will not +attempt to eject the media. +.PP +You can eject an audio CD. Some CD-ROM drives will refuse to open the tray if +the drive is empty. Some devices do not support the tray close command. +.PP +If the auto-eject feature is enabled, then the drive will always be ejected +after running this command. Not all Linux kernel CD-ROM drivers support the +auto-eject mode. There is no way to find out the state of the auto-eject mode. +.PP +You need appropriate privileges to access the device files. Running as root is +required to eject some devices (e.g. SCSI devices). +.SH AUTHORS +.MT tranter@\:pobox.com +Jeff Tranter +.ME +- original author. +.br +.MT kzak@\:redhat.com +Karel Zak +.ME +and +.MT mluscon@\:redhat.com +Michal Luscon +.ME +- util-linux version. +.SH SEE ALSO +.BR findmnt (8), +.BR lsblk (8), +.BR mount (8), +.BR umount (8) +.SH AVAILABILITY +The eject command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/eject.c b/sys-utils/eject.c new file mode 100644 index 0000000..8196b60 --- /dev/null +++ b/sys-utils/eject.c @@ -0,0 +1,1044 @@ +/* + * Copyright (C) 1994-2005 Jeff Tranter (tranter@pobox.com) + * Copyright (C) 2012 Karel Zak <kzak@redhat.com> + * Copyright (C) Michal Luscon <mluscon@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <limits.h> +#include <err.h> +#include <stdarg.h> + +#include <getopt.h> +#include <errno.h> +#include <regex.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/wait.h> +#include <sys/mtio.h> +#include <linux/cdrom.h> +#include <linux/fd.h> +#include <sys/mount.h> +#include <scsi/scsi.h> +#include <scsi/sg.h> +#include <scsi/scsi_ioctl.h> +#include <sys/time.h> + +#include <libmount.h> + +#include "c.h" +#include "closestream.h" +#include "nls.h" +#include "strutils.h" +#include "xalloc.h" +#include "pathnames.h" +#include "sysfs.h" +#include "monotonic.h" + +/* + * sg_io_hdr_t driver_status -- see kernel include/scsi/scsi.h + */ +#ifndef DRIVER_SENSE +# define DRIVER_SENSE 0x08 +#endif + + +#define EJECT_DEFAULT_DEVICE "/dev/cdrom" + + +/* Used by the toggle_tray() function. If ejecting the tray takes this + * time or less, the tray was probably already ejected, so we close it + * again. + */ +#define TRAY_WAS_ALREADY_OPEN_USECS 200000 /* about 0.2 seconds */ + +struct eject_control { + struct libmnt_table *mtab; + char *device; /* device or mount point to be ejected */ + int fd; /* file descriptor for device */ + unsigned int /* command flags and arguments */ + a_option:1, + c_option:1, + d_option:1, + F_option:1, + f_option:1, + i_option:1, + M_option:1, + m_option:1, + n_option:1, + p_option:1, + q_option:1, + r_option:1, + s_option:1, + T_option:1, + t_option:1, + v_option:1, + X_option:1, + x_option:1, + a_arg:1, + i_arg:1; + long int c_arg; /* changer slot number */ + long int x_arg; /* cd speed */ +}; + +static void vinfo(const char *fmt, va_list va) +{ + fprintf(stdout, "%s: ", program_invocation_short_name); + vprintf(fmt, va); + fputc('\n', stdout); +} + +static inline void verbose(const struct eject_control *ctl, const char *fmt, ...) +{ + va_list va; + + if (!ctl->v_option) + return; + + va_start(va, fmt); + vinfo(fmt, va); + va_end(va); +} + +static inline void info(const char *fmt, ...) +{ + va_list va; + va_start(va, fmt); + vinfo(fmt, va); + va_end(va); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %s [options] [<device>|<mountpoint>]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Eject removable media.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --auto <on|off> turn auto-eject feature on or off\n" + " -c, --changerslot <slot> switch discs on a CD-ROM changer\n" + " -d, --default display default device\n" + " -f, --floppy eject floppy\n" + " -F, --force don't care about device type\n" + " -i, --manualeject <on|off> toggle manual eject protection on/off\n" + " -m, --no-unmount do not unmount device even if it is mounted\n" + " -M, --no-partitions-unmount do not unmount another partitions\n" + " -n, --noop don't eject, just show device found\n" + " -p, --proc use /proc/mounts instead of /etc/mtab\n" + " -q, --tape eject tape\n" + " -r, --cdrom eject CD-ROM\n" + " -s, --scsi eject SCSI device\n" + " -t, --trayclose close tray\n" + " -T, --traytoggle toggle tray\n" + " -v, --verbose enable verbose output\n" + " -x, --cdspeed <speed> set CD-ROM max speed\n" + " -X, --listspeed list CD-ROM available speeds\n"), + out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(29)); + + fputs(_("\nBy default tries -r, -s, -f, and -q in order until success.\n"), out); + printf(USAGE_MAN_TAIL("eject(1)")); + + exit(EXIT_SUCCESS); +} + + +/* Handle command line options. */ +static void parse_args(struct eject_control *ctl, int argc, char **argv) +{ + static const struct option long_opts[] = + { + {"auto", required_argument, NULL, 'a'}, + {"cdrom", no_argument, NULL, 'r'}, + {"cdspeed", required_argument, NULL, 'x'}, + {"changerslot", required_argument, NULL, 'c'}, + {"default", no_argument, NULL, 'd'}, + {"floppy", no_argument, NULL, 'f'}, + {"force", no_argument, NULL, 'F'}, + {"help", no_argument, NULL, 'h'}, + {"listspeed", no_argument, NULL, 'X'}, + {"manualeject", required_argument, NULL, 'i'}, + {"noop", no_argument, NULL, 'n'}, + {"no-unmount", no_argument, NULL, 'm'}, + {"no-partitions-unmount", no_argument, NULL, 'M' }, + {"proc", no_argument, NULL, 'p'}, + {"scsi", no_argument, NULL, 's'}, + {"tape", no_argument, NULL, 'q'}, + {"trayclose", no_argument, NULL, 't'}, + {"traytoggle", no_argument, NULL, 'T'}, + {"verbose", no_argument, NULL, 'v'}, + {"version", no_argument, NULL, 'V'}, + {NULL, 0, NULL, 0} + }; + int c; + + while ((c = getopt_long(argc, argv, + "a:c:i:x:dfFhnqrstTXvVpmM", long_opts, NULL)) != -1) { + switch (c) { + case 'a': + ctl->a_option = 1; + ctl->a_arg = parse_switch(optarg, _("argument error"), + "on", "off", "1", "0", NULL); + break; + case 'c': + ctl->c_option = 1; + ctl->c_arg = strtoul_or_err(optarg, _("invalid argument to --changerslot/-c option")); + break; + case 'x': + ctl->x_option = 1; + ctl->x_arg = strtoul_or_err(optarg, _("invalid argument to --cdspeed/-x option")); + break; + case 'd': + ctl->d_option = 1; + break; + case 'f': + ctl->f_option = 1; + break; + case 'F': + ctl->F_option = 1; + break; + case 'h': + usage(); + break; + case 'i': + ctl->i_option = 1; + ctl->i_arg = parse_switch(optarg, _("argument error"), + "on", "off", "1", "0", NULL); + break; + case 'm': + ctl->m_option = 1; + break; + case 'M': + ctl->M_option = 1; + break; + case 'n': + ctl->n_option = 1; + break; + case 'p': + ctl->p_option = 1; + break; + case 'q': + ctl->q_option = 1; + break; + case 'r': + ctl->r_option = 1; + break; + case 's': + ctl->s_option = 1; + break; + case 't': + ctl->t_option = 1; + break; + case 'T': + ctl->T_option = 1; + break; + case 'X': + ctl->X_option = 1; + break; + case 'v': + ctl->v_option = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + exit(EXIT_SUCCESS); + break; + default: + errtryhelp(EXIT_FAILURE); + break; + } + } + + /* check for a single additional argument */ + if ((argc - optind) > 1) + errx(EXIT_FAILURE, _("too many arguments")); + + if ((argc - optind) == 1) + ctl->device = xstrdup(argv[optind]); +} + +/* + * Given name, such as foo, see if any of the following exist: + * + * foo (if foo starts with '.' or '/') + * /dev/foo + * + * If found, return the full path. If not found, return 0. + * Returns pointer to dynamically allocated string. + */ +static char *find_device(const char *name) +{ + if (!name) + return NULL; + + if ((*name == '.' || *name == '/') && access(name, F_OK) == 0) + return xstrdup(name); + else { + char buf[PATH_MAX]; + + snprintf(buf, sizeof(buf), "/dev/%s", name); + if (access(buf, F_OK) == 0) + return xstrdup(buf); + } + + return NULL; +} + +/* Set or clear auto-eject mode. */ +static void auto_eject(const struct eject_control *ctl) +{ + int status = -1; + +#if defined(CDROM_SET_OPTIONS) && defined(CDROM_CLEAR_OPTIONS) + if (ctl->a_arg) + status = ioctl(ctl->fd, CDROM_SET_OPTIONS, CDO_AUTO_EJECT); + else + status = ioctl(ctl->fd, CDROM_CLEAR_OPTIONS, CDO_AUTO_EJECT); +#else + errno = ENOSYS; +#endif + if (status < 0) + err(EXIT_FAILURE,_("CD-ROM auto-eject command failed")); +} + +/* + * Stops CDROM from opening on manual eject button press. + * This can be useful when you carry your laptop + * in your bag while it's on and no CD inserted in it's drive. + * Implemented as found in Documentation/ioctl/cdrom.txt + */ +static void manual_eject(const struct eject_control *ctl) +{ + if (ioctl(ctl->fd, CDROM_LOCKDOOR, ctl->i_arg) < 0) { + switch (errno) { + case EDRIVE_CANT_DO_THIS: + errx(EXIT_FAILURE, _("CD-ROM door lock is not supported")); + case EBUSY: + errx(EXIT_FAILURE, _("other users have the drive open and not CAP_SYS_ADMIN")); + default: + err(EXIT_FAILURE, _("CD-ROM lock door command failed")); + } + } + + if (ctl->i_arg) + info(_("CD-Drive may NOT be ejected with device button")); + else + info(_("CD-Drive may be ejected with device button")); +} + +/* + * Changer select. CDROM_SELECT_DISC is preferred, older kernels used + * CDROMLOADFROMSLOT. + */ +static void changer_select(const struct eject_control *ctl) +{ +#ifdef CDROM_SELECT_DISC + if (ioctl(ctl->fd, CDROM_SELECT_DISC, ctl->c_arg) < 0) + err(EXIT_FAILURE, _("CD-ROM select disc command failed")); + +#elif defined CDROMLOADFROMSLOT + if (ioctl(ctl->fd, CDROMLOADFROMSLOT, ctl->c_arg) != 0) + err(EXIT_FAILURE, _("CD-ROM load from slot command failed")); +#else + warnx(_("IDE/ATAPI CD-ROM changer not supported by this kernel\n") ); +#endif +} + +/* + * Close tray. Not supported by older kernels. + */ +static void close_tray(int fd) +{ + int status; + +#if defined(CDROMCLOSETRAY) || defined(CDIOCCLOSE) +#if defined(CDROMCLOSETRAY) + status = ioctl(fd, CDROMCLOSETRAY); +#elif defined(CDIOCCLOSE) + status = ioctl(fd, CDIOCCLOSE); +#endif + if (status != 0) + err(EXIT_FAILURE, _("CD-ROM tray close command failed")); +#else + warnx(_("CD-ROM tray close command not supported by this kernel\n")); +#endif +} + +/* + * Eject using CDROMEJECT ioctl. + */ +static int eject_cdrom(int fd) +{ +#if defined(CDROMEJECT) + int ret = ioctl(fd, CDROM_LOCKDOOR, 0); + if (ret < 0) + return 0; + return ioctl(fd, CDROMEJECT) >= 0; +#elif defined(CDIOCEJECT) + return ioctl(fd, CDIOCEJECT) >= 0; +#else + warnx(_("CD-ROM eject unsupported")); + errno = ENOSYS; + return 0; +#endif +} + +/* + * Toggle tray. + * + * Written by Benjamin Schwenk <benjaminschwenk@yahoo.de> and + * Sybren Stuvel <sybren@thirdtower.com> + * + * Not supported by older kernels because it might use + * CloseTray(). + * + */ +static void toggle_tray(int fd) +{ +#ifdef CDROM_DRIVE_STATUS + /* First ask the CDROM for info, otherwise fall back to manual. */ + switch (ioctl(fd, CDROM_DRIVE_STATUS)) { + case CDS_TRAY_OPEN: + close_tray(fd); + return; + + case CDS_NO_DISC: + case CDS_DISC_OK: + if (!eject_cdrom(fd)) + err(EXIT_FAILURE, _("CD-ROM eject command failed")); + return; + case CDS_NO_INFO: + warnx(_("no CD-ROM information available")); + return; + case CDS_DRIVE_NOT_READY: + warnx(_("CD-ROM drive is not ready")); + return; + default: + err(EXIT_FAILURE, _("CD-ROM status command failed")); + } +#else + struct timeval time_start, time_stop; + int time_elapsed; + + /* Try to open the CDROM tray and measure the time therefor + * needed. In my experience the function needs less than 0.05 + * seconds if the tray was already open, and at least 1.5 seconds + * if it was closed. */ + gettime_monotonic(&time_start); + + /* Send the CDROMEJECT command to the device. */ + if (!eject_cdrom(fd)) + err(EXIT_FAILURE, _("CD-ROM eject command failed")); + + /* Get the second timestamp, to measure the time needed to open + * the tray. */ + gettime_monotonic(&time_stop); + + time_elapsed = (time_stop.tv_sec * 1000000 + time_stop.tv_usec) - + (time_start.tv_sec * 1000000 + time_start.tv_usec); + + /* If the tray "opened" too fast, we can be nearly sure, that it + * was already open. In this case, close it now. Else the tray was + * closed before. This would mean that we are done. */ + if (time_elapsed < TRAY_WAS_ALREADY_OPEN_USECS) + close_tray(fd); +#endif +} + +/* + * Select Speed of CD-ROM drive. + * Thanks to Roland Krivanek (krivanek@fmph.uniba.sk) + * http://dmpc.dbp.fmph.uniba.sk/~krivanek/cdrom_speed/ + */ +static void select_speed(const struct eject_control *ctl) +{ +#ifdef CDROM_SELECT_SPEED + if (ioctl(ctl->fd, CDROM_SELECT_SPEED, ctl->x_arg) != 0) + err(EXIT_FAILURE, _("CD-ROM select speed command failed")); +#else + warnx(_("CD-ROM select speed command not supported by this kernel")); +#endif +} + +/* + * Read Speed of CD-ROM drive. From Linux 2.6.13, the current speed + * is correctly reported + */ +static int read_speed(const char *devname) +{ + int drive_number = -1; + char *name; + FILE *f; + + f = fopen(_PATH_PROC_CDROMINFO, "r"); + if (!f) + err(EXIT_FAILURE, _("cannot open %s"), _PATH_PROC_CDROMINFO); + + name = strrchr(devname, '/') + 1; + + while (name && !feof(f)) { + char line[512]; + char *str; + + if (!fgets(line, sizeof(line), f)) + break; + + /* find drive number in line "drive name" */ + if (drive_number == -1) { + if (strncmp(line, "drive name:", 11) == 0) { + str = strtok(&line[11], "\t "); + drive_number = 0; + while (str && strncmp(name, str, strlen(name)) != 0) { + drive_number++; + str = strtok(NULL, "\t "); + if (!str) + errx(EXIT_FAILURE, + _("%s: failed to finding CD-ROM name"), + _PATH_PROC_CDROMINFO); + } + } + /* find line "drive speed" and read the correct speed */ + } else { + if (strncmp(line, "drive speed:", 12) == 0) { + int i; + + str = strtok(&line[12], "\t "); + for (i = 1; i < drive_number; i++) + str = strtok(NULL, "\t "); + + if (!str) + errx(EXIT_FAILURE, + _("%s: failed to read speed"), + _PATH_PROC_CDROMINFO); + fclose(f); + return atoi(str); + } + } + } + + errx(EXIT_FAILURE, _("failed to read speed")); +} + +/* + * List Speed of CD-ROM drive. + */ +static void list_speeds(struct eject_control *ctl) +{ + int max_speed, curr_speed = 0; + + select_speed(ctl); + max_speed = read_speed(ctl->device); + + while (curr_speed < max_speed) { + ctl->x_arg = curr_speed + 1; + select_speed(ctl); + curr_speed = read_speed(ctl->device); + if (ctl->x_arg < curr_speed) + printf("%d ", curr_speed); + else + curr_speed = ctl->x_arg + 1; + } + + printf("\n"); +} + +/* + * Eject using SCSI SG_IO commands. Return 1 if successful, 0 otherwise. + */ +static int eject_scsi(const struct eject_control *ctl) +{ + int status, k; + sg_io_hdr_t io_hdr; + unsigned char allowRmBlk[6] = {ALLOW_MEDIUM_REMOVAL, 0, 0, 0, 0, 0}; + unsigned char startStop1Blk[6] = {START_STOP, 0, 0, 0, 1, 0}; + unsigned char startStop2Blk[6] = {START_STOP, 0, 0, 0, 2, 0}; + unsigned char inqBuff[2]; + unsigned char sense_buffer[32]; + + if ((ioctl(ctl->fd, SG_GET_VERSION_NUM, &k) < 0) || (k < 30000)) { + verbose(ctl, _("not an sg device, or old sg driver")); + return 0; + } + + memset(&io_hdr, 0, sizeof(sg_io_hdr_t)); + io_hdr.interface_id = 'S'; + io_hdr.cmd_len = 6; + io_hdr.mx_sb_len = sizeof(sense_buffer); + io_hdr.dxfer_direction = SG_DXFER_NONE; + io_hdr.dxfer_len = 0; + io_hdr.dxferp = inqBuff; + io_hdr.sbp = sense_buffer; + io_hdr.timeout = 10000; + + io_hdr.cmdp = allowRmBlk; + status = ioctl(ctl->fd, SG_IO, (void *)&io_hdr); + if (status < 0 || io_hdr.host_status || io_hdr.driver_status) + return 0; + + io_hdr.cmdp = startStop1Blk; + status = ioctl(ctl->fd, SG_IO, (void *)&io_hdr); + if (status < 0 || io_hdr.host_status) + return 0; + + /* Ignore errors when there is not medium -- in this case driver sense + * buffer sets MEDIUM NOT PRESENT (3a) bit. For more details see: + * http://www.tldp.org/HOWTO/archived/SCSI-Programming-HOWTO/SCSI-Programming-HOWTO-22.html#sec-sensecodes + * -- kzak Jun 2013 + */ + if (io_hdr.driver_status != 0 && + !(io_hdr.driver_status == DRIVER_SENSE && io_hdr.sbp && + io_hdr.sbp[12] == 0x3a)) + return 0; + + io_hdr.cmdp = startStop2Blk; + status = ioctl(ctl->fd, SG_IO, (void *)&io_hdr); + if (status < 0 || io_hdr.host_status || io_hdr.driver_status) + return 0; + + /* force kernel to reread partition table when new disc inserted */ + ioctl(ctl->fd, BLKRRPART); + return 1; +} + +/* + * Eject using FDEJECT ioctl. Return 1 if successful, 0 otherwise. + */ +static int eject_floppy(int fd) +{ + return ioctl(fd, FDEJECT) >= 0; +} + + +/* + * Rewind and eject using tape ioctl. Return 1 if successful, 0 otherwise. + */ +static int eject_tape(int fd) +{ + struct mtop op = { .mt_op = MTOFFL, .mt_count = 0 }; + + return ioctl(fd, MTIOCTOP, &op) >= 0; +} + + +/* umount a device. */ +static void umount_one(const struct eject_control *ctl, const char *name) +{ + int status; + + if (!name) + return; + + verbose(ctl, _("%s: unmounting"), name); + + switch (fork()) { + case 0: /* child */ + if (setgid(getgid()) < 0) + err(EXIT_FAILURE, _("cannot set group id")); + + if (setuid(getuid()) < 0) + err(EXIT_FAILURE, _("cannot set user id")); + + if (ctl->p_option) + execl("/bin/umount", "/bin/umount", name, "-n", NULL); + else + execl("/bin/umount", "/bin/umount", name, NULL); + + errexec("/bin/umount"); + + case -1: + warn( _("unable to fork")); + break; + + default: /* parent */ + wait(&status); + if (WIFEXITED(status) == 0) + errx(EXIT_FAILURE, + _("unmount of `%s' did not exit normally"), name); + + if (WEXITSTATUS(status) != 0) + errx(EXIT_FAILURE, _("unmount of `%s' failed\n"), name); + break; + } +} + +/* Open a device file. */ +static void open_device(struct eject_control *ctl) +{ + ctl->fd = open(ctl->device, O_RDWR | O_NONBLOCK); + if (ctl->fd < 0) + ctl->fd = open(ctl->device, O_RDONLY | O_NONBLOCK); + if (ctl->fd == -1) + err(EXIT_FAILURE, _("cannot open %s"), ctl->device); +} + +/* + * See if device has been mounted by looking in mount table. If so, set + * device name and mount point name, and return 1, otherwise return 0. + */ +static int device_get_mountpoint(struct eject_control *ctl, char **devname, char **mnt) +{ + struct libmnt_fs *fs; + int rc; + + *mnt = NULL; + + if (!ctl->mtab) { + struct libmnt_cache *cache; + + ctl->mtab = mnt_new_table(); + if (!ctl->mtab) + err(EXIT_FAILURE, _("failed to initialize libmount table")); + + cache = mnt_new_cache(); + mnt_table_set_cache(ctl->mtab, cache); + mnt_unref_cache(cache); + + if (ctl->p_option) + rc = mnt_table_parse_file(ctl->mtab, _PATH_PROC_MOUNTINFO); + else + rc = mnt_table_parse_mtab(ctl->mtab, NULL); + if (rc) + err(EXIT_FAILURE, _("failed to parse mount table")); + } + + fs = mnt_table_find_source(ctl->mtab, *devname, MNT_ITER_BACKWARD); + if (!fs) { + /* maybe 'devname' is mountpoint rather than a real device */ + fs = mnt_table_find_target(ctl->mtab, *devname, MNT_ITER_BACKWARD); + if (fs) { + free(*devname); + *devname = xstrdup(mnt_fs_get_source(fs)); + } + } + + if (fs) + *mnt = xstrdup(mnt_fs_get_target(fs)); + return *mnt ? 0 : -1; +} + +static char *get_disk_devname(const char *device) +{ + struct stat st; + dev_t diskno = 0; + char diskname[128]; + + if (stat(device, &st) != 0) + return NULL; + + /* get whole-disk devno */ + if (sysfs_devno_to_wholedisk(st.st_rdev, diskname, + sizeof(diskname), &diskno) != 0) + return NULL; + + return st.st_rdev == diskno ? NULL : find_device(diskname); +} + +static int umount_partitions(struct eject_control *ctl) +{ + struct path_cxt *pc = NULL; + dev_t devno; + DIR *dir = NULL; + struct dirent *d; + int count = 0; + + devno = sysfs_devname_to_devno(ctl->device); + if (devno) + pc = ul_new_sysfs_path(devno, NULL, NULL); + if (!pc) + return 0; + + /* open /sys/block/<wholedisk> */ + if (!(dir = ul_path_opendir(pc, NULL))) + goto done; + + /* scan for partition subdirs */ + while ((d = readdir(dir))) { + if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, "..")) + continue; + + if (sysfs_blkdev_is_partition_dirent(dir, d, ctl->device)) { + char *mnt = NULL; + char *dev = find_device(d->d_name); + + if (dev && device_get_mountpoint(ctl, &dev, &mnt) == 0) { + verbose(ctl, _("%s: mounted on %s"), dev, mnt); + if (!ctl->M_option) + umount_one(ctl, mnt); + count++; + } + free(dev); + free(mnt); + } + } + +done: + if (dir) + closedir(dir); + ul_unref_path(pc); + + return count; +} + +static int is_hotpluggable(const struct eject_control *ctl) +{ + struct path_cxt *pc = NULL; + dev_t devno; + int rc = 0; + + devno = sysfs_devname_to_devno(ctl->device); + if (devno) + pc = ul_new_sysfs_path(devno, NULL, NULL); + if (!pc) + return 0; + + rc = sysfs_blkdev_is_hotpluggable(pc); + ul_unref_path(pc); + return rc; +} + + +/* handle -x option */ +static void set_device_speed(struct eject_control *ctl) +{ + if (!ctl->x_option) + return; + + if (ctl->x_arg == 0) + verbose(ctl, _("setting CD-ROM speed to auto")); + else + verbose(ctl, _("setting CD-ROM speed to %ldX"), ctl->x_arg); + + open_device(ctl); + select_speed(ctl); + exit(EXIT_SUCCESS); +} + + +/* main program */ +int main(int argc, char **argv) +{ + char *disk = NULL; + char *mountpoint = NULL; + int worked = 0; /* set to 1 when successfully ejected */ + struct eject_control ctl = { NULL }; + + setlocale(LC_ALL,""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + /* parse the command line arguments */ + parse_args(&ctl, argc, argv); + + /* handle -d option */ + if (ctl.d_option) { + info(_("default device: `%s'"), EJECT_DEFAULT_DEVICE); + return EXIT_SUCCESS; + } + + if (!ctl.device) { + ctl.device = mnt_resolve_path(EJECT_DEFAULT_DEVICE, NULL); + verbose(&ctl, _("using default device `%s'"), ctl.device); + } else { + char *p; + + if (ctl.device[strlen(ctl.device) - 1] == '/') + ctl.device[strlen(ctl.device) - 1] = '\0'; + + /* figure out full device or mount point name */ + p = find_device(ctl.device); + if (p) + free(ctl.device); + else + p = ctl.device; + + ctl.device = mnt_resolve_spec(p, NULL); + free(p); + } + + if (!ctl.device) + errx(EXIT_FAILURE, _("%s: unable to find device"), ctl.device); + + verbose(&ctl, _("device name is `%s'"), ctl.device); + + device_get_mountpoint(&ctl, &ctl.device, &mountpoint); + if (mountpoint) + verbose(&ctl, _("%s: mounted on %s"), ctl.device, mountpoint); + else + verbose(&ctl, _("%s: not mounted"), ctl.device); + + disk = get_disk_devname(ctl.device); + if (disk) { + verbose(&ctl, _("%s: disc device: %s (disk device will be used for eject)"), ctl.device, disk); + free(ctl.device); + ctl.device = disk; + disk = NULL; + } else { + struct stat st; + + if (stat(ctl.device, &st) != 0 || !S_ISBLK(st.st_mode)) + errx(EXIT_FAILURE, _("%s: not found mountpoint or device " + "with the given name"), ctl.device); + + verbose(&ctl, _("%s: is whole-disk device"), ctl.device); + } + + if (ctl.F_option == 0 && is_hotpluggable(&ctl) == 0) + errx(EXIT_FAILURE, _("%s: is not hot-pluggable device"), ctl.device); + + /* handle -n option */ + if (ctl.n_option) { + info(_("device is `%s'"), ctl.device); + verbose(&ctl, _("exiting due to -n/--noop option")); + return EXIT_SUCCESS; + } + + /* handle -i option */ + if (ctl.i_option) { + open_device(&ctl); + manual_eject(&ctl); + return EXIT_SUCCESS; + } + + /* handle -a option */ + if (ctl.a_option) { + if (ctl.a_arg) + verbose(&ctl, _("%s: enabling auto-eject mode"), ctl.device); + else + verbose(&ctl, _("%s: disabling auto-eject mode"), ctl.device); + open_device(&ctl); + auto_eject(&ctl); + return EXIT_SUCCESS; + } + + /* handle -t option */ + if (ctl.t_option) { + verbose(&ctl, _("%s: closing tray"), ctl.device); + open_device(&ctl); + close_tray(ctl.fd); + set_device_speed(&ctl); + return EXIT_SUCCESS; + } + + /* handle -T option */ + if (ctl.T_option) { + verbose(&ctl, _("%s: toggling tray"), ctl.device); + open_device(&ctl); + toggle_tray(ctl.fd); + set_device_speed(&ctl); + return EXIT_SUCCESS; + } + + /* handle -X option */ + if (ctl.X_option) { + verbose(&ctl, _("%s: listing CD-ROM speed"), ctl.device); + open_device(&ctl); + list_speeds(&ctl); + return EXIT_SUCCESS; + } + + /* handle -x option only */ + if (!ctl.c_option) + set_device_speed(&ctl); + + + /* + * Unmount all partitions if -m is not specified; or umount given + * mountpoint if -M is specified, otherwise print error of another + * partition is mounted. + */ + if (!ctl.m_option) { + int ct = umount_partitions(&ctl); + + if (ct == 0 && mountpoint) + umount_one(&ctl, mountpoint); /* probably whole-device */ + + if (ctl.M_option) { + if (ct == 1 && mountpoint) + umount_one(&ctl, mountpoint); + else if (ct) + errx(EXIT_FAILURE, _("error: %s: device in use"), ctl.device); + } + } + + /* handle -c option */ + if (ctl.c_option) { + verbose(&ctl, _("%s: selecting CD-ROM disc #%ld"), ctl.device, ctl.c_arg); + open_device(&ctl); + changer_select(&ctl); + set_device_speed(&ctl); + return EXIT_SUCCESS; + } + + /* if user did not specify type of eject, try all four methods */ + if (ctl.r_option + ctl.s_option + ctl.f_option + ctl.q_option == 0) + ctl.r_option = ctl.s_option = ctl.f_option = ctl.q_option = 1; + + /* open device */ + open_device(&ctl); + + /* try various methods of ejecting until it works */ + if (ctl.r_option) { + verbose(&ctl, _("%s: trying to eject using CD-ROM eject command"), ctl.device); + worked = eject_cdrom(ctl.fd); + verbose(&ctl, worked ? _("CD-ROM eject command succeeded") : + _("CD-ROM eject command failed")); + } + + if (ctl.s_option && !worked) { + verbose(&ctl, _("%s: trying to eject using SCSI commands"), ctl.device); + worked = eject_scsi(&ctl); + verbose(&ctl, worked ? _("SCSI eject succeeded") : + _("SCSI eject failed")); + } + + if (ctl.f_option && !worked) { + verbose(&ctl, _("%s: trying to eject using floppy eject command"), ctl.device); + worked = eject_floppy(ctl.fd); + verbose(&ctl, worked ? _("floppy eject command succeeded") : + _("floppy eject command failed")); + } + + if (ctl.q_option && !worked) { + verbose(&ctl, _("%s: trying to eject using tape offline command"), ctl.device); + worked = eject_tape(ctl.fd); + verbose(&ctl, worked ? _("tape offline command succeeded") : + _("tape offline command failed")); + } + + if (!worked) + errx(EXIT_FAILURE, _("unable to eject")); + + /* cleanup */ + close(ctl.fd); + free(ctl.device); + free(mountpoint); + + mnt_unref_table(ctl.mtab); + + return EXIT_SUCCESS; +} diff --git a/sys-utils/fallocate.1 b/sys-utils/fallocate.1 new file mode 100644 index 0000000..fe5072a --- /dev/null +++ b/sys-utils/fallocate.1 @@ -0,0 +1,191 @@ +.TH FALLOCATE 1 "April 2014" "util-linux" "User Commands" +.SH NAME +fallocate \- preallocate or deallocate space to a file +.SH SYNOPSIS +.B fallocate +.RB [ \-c | \-p | \-z ] +.RB [ \-o +.IR offset ] +.B \-l +.I length +.RB [ \-n ] +.I filename +.PP +.B fallocate \-d +.RB [ \-o +.IR offset ] +.RB [ \-l +.IR length ] +.I filename +.PP +.B fallocate \-x +.RB [ \-o +.IR offset ] +.B \-l +.I length +.I filename +.SH DESCRIPTION +.B fallocate +is used to manipulate the allocated disk space for a file, +either to deallocate or preallocate it. +For filesystems which support the fallocate system call, +preallocation is done quickly by allocating blocks and marking them as +uninitialized, requiring no IO to the data blocks. +This is much faster than creating a file by filling it with zeroes. +.PP +The exit code returned by +.B fallocate +is 0 on success and 1 on failure. +.SH OPTIONS +The +.I length +and +.I offset +arguments may be followed by the multiplicative suffixes KiB (=1024), +MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB, and YiB (the "iB" is +optional, e.g., "K" has the same meaning as "KiB") or the suffixes +KB (=1000), MB (=1000*1000), and so on for GB, TB, PB, EB, ZB, and YB. +.PP +The options +.BR \-\-collapse\-range ", " \-\-dig\-holes ", " \-\-punch\-hole , +and +.B \-\-zero\-range +are mutually exclusive. +.TP +.BR \-c ", " \-\-collapse\-range +Removes a byte range from a file, without leaving a hole. +The byte range to be collapsed starts at +.I offset +and continues for +.I length +bytes. +At the completion of the operation, +the contents of the file starting at the location +.IR offset + length +will be appended at the location +.IR offset , +and the file will be +.I length +bytes smaller. +The option +.B \-\-keep\-size +may not be specified for the collapse-range operation. +.sp +Available since Linux 3.15 for ext4 (only for extent-based files) and XFS. +.sp +A filesystem may place limitations on the granularity of the operation, in +order to ensure efficient implementation. Typically, offset and len must be a +multiple of the filesystem logical block size, which varies according to the +filesystem type and configuration. If a filesystem has such a requirement, +the operation will fail with the error EINVAL if this requirement is violated. +.TP +.BR \-d ", " \-\-dig\-holes +Detect and dig holes. +This makes the file sparse in-place, without using extra disk space. +The minimum size of the hole depends on filesystem I/O block size +(usually 4096 bytes). +Also, when using this option, +.B \-\-keep\-size +is implied. If no range is specified by +.B \-\-offset +and +.BR \-\-length , +then the entire file is analyzed for holes. +.sp +You can think of this option as doing a +.RB """" "cp \-\-sparse" """" +and then renaming the destination file to the original, +without the need for extra disk space. +.sp +See \fB\-\-punch\-hole\fP for a list of supported filesystems. +.TP +.BR \-i ", " \-\-insert\-range +Insert a hole of +.I length +bytes from +.IR offset , +shifting existing data. +.TP +.BR \-l ", " "\-\-length " \fIlength +Specifies the length of the range, in bytes. +.TP +.BR \-n ", " \-\-keep\-size +Do not modify the apparent length of the file. This may effectively allocate +blocks past EOF, which can be removed with a truncate. +.TP +.BR \-o ", " "\-\-offset " \fIoffset +Specifies the beginning offset of the range, in bytes. +.TP +.BR \-p ", " \-\-punch\-hole +Deallocates space (i.e., creates a hole) in the byte range starting at +.I offset +and continuing for +.I length +bytes. +Within the specified range, partial filesystem blocks are zeroed, +and whole filesystem blocks are removed from the file. +After a successful call, +subsequent reads from this range will return zeroes. +This option may not be specified at the same time as the +.B \-\-zero\-range +option. +Also, when using this option, +.B \-\-keep\-size +is implied. +.sp +Supported for XFS (since Linux 2.6.38), ext4 (since Linux 3.0), +Btrfs (since Linux 3.7) and tmpfs (since Linux 3.5). +.TP +.BR \-v ", " \-\-verbose +Enable verbose mode. +.TP +.BR \-x ", " \-\-posix +Enable POSIX operation mode. +In that mode allocation operation always completes, +but it may take longer time when fast allocation is not supported by +the underlying filesystem. +.TP +.BR \-z ", " \-\-zero\-range +Zeroes space in the byte range starting at +.I offset +and continuing for +.I length +bytes. +Within the specified range, blocks are preallocated for the regions +that span the holes in the file. +After a successful call, +subsequent reads from this range will return zeroes. +.sp +Zeroing is done within the filesystem preferably by converting the +range into unwritten extents. This approach means that the specified +range will not be physically zeroed out on the device (except for +partial blocks at the either end of the range), and I/O is +(otherwise) required only to update metadata. +.sp +Option \fB\-\-keep\-size\fP can be specified to prevent file length +modification. +.sp +Available since Linux 3.14 for ext4 (only for extent-based files) and XFS. +.TP +.BR \-V ", " \-\-version +Display version information and exit. +.TP +.BR \-h ", " \-\-help +Display help text and exit. +.SH AUTHORS +.MT sandeen@redhat.com +Eric Sandeen +.ME +.br +.MT kzak@redhat.com +Karel Zak +.ME +.SH SEE ALSO +.BR truncate (1), +.BR fallocate (2), +.BR posix_fallocate (3) +.SH AVAILABILITY +The fallocate command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/fallocate.c b/sys-utils/fallocate.c new file mode 100644 index 0000000..ba3867c --- /dev/null +++ b/sys-utils/fallocate.c @@ -0,0 +1,412 @@ +/* + * fallocate - utility to use the fallocate system call + * + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. + * Written by Eric Sandeen <sandeen@redhat.com> + * Karel Zak <kzak@redhat.com> + * + * cvtnum routine taken from xfsprogs, + * Copyright (c) 2003-2005 Silicon Graphics, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <getopt.h> +#include <limits.h> +#include <string.h> + +#ifndef HAVE_FALLOCATE +# include <sys/syscall.h> +#endif + +#if defined(HAVE_LINUX_FALLOC_H) && \ + (!defined(FALLOC_FL_KEEP_SIZE) || !defined(FALLOC_FL_PUNCH_HOLE) || \ + !defined(FALLOC_FL_COLLAPSE_RANGE) || !defined(FALLOC_FL_ZERO_RANGE) || \ + !defined(FALLOC_FL_INSERT_RANGE)) +# include <linux/falloc.h> /* non-libc fallback for FALLOC_FL_* flags */ +#endif + + +#ifndef FALLOC_FL_KEEP_SIZE +# define FALLOC_FL_KEEP_SIZE 0x1 +#endif + +#ifndef FALLOC_FL_PUNCH_HOLE +# define FALLOC_FL_PUNCH_HOLE 0x2 +#endif + +#ifndef FALLOC_FL_COLLAPSE_RANGE +# define FALLOC_FL_COLLAPSE_RANGE 0x8 +#endif + +#ifndef FALLOC_FL_ZERO_RANGE +# define FALLOC_FL_ZERO_RANGE 0x10 +#endif + +#ifndef FALLOC_FL_INSERT_RANGE +# define FALLOC_FL_INSERT_RANGE 0x20 +#endif + +#include "nls.h" +#include "strutils.h" +#include "c.h" +#include "closestream.h" +#include "xalloc.h" +#include "optutils.h" + +static int verbose; +static char *filename; + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %s [options] <filename>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Preallocate space to, or deallocate space from a file.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -c, --collapse-range remove a range from the file\n"), out); + fputs(_(" -d, --dig-holes detect zeroes and replace with holes\n"), out); + fputs(_(" -i, --insert-range insert a hole at range, shifting existing data\n"), out); + fputs(_(" -l, --length <num> length for range operations, in bytes\n"), out); + fputs(_(" -n, --keep-size maintain the apparent size of the file\n"), out); + fputs(_(" -o, --offset <num> offset for range operations, in bytes\n"), out); + fputs(_(" -p, --punch-hole replace a range with a hole (implies -n)\n"), out); + fputs(_(" -z, --zero-range zero and ensure allocation of a range\n"), out); +#ifdef HAVE_POSIX_FALLOCATE + fputs(_(" -x, --posix use posix_fallocate(3) instead of fallocate(2)\n"), out); +#endif + fputs(_(" -v, --verbose verbose mode\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(22)); + + printf(USAGE_MAN_TAIL("fallocate(1)")); + + exit(EXIT_SUCCESS); +} + +static loff_t cvtnum(char *s) +{ + uintmax_t x; + + if (strtosize(s, &x)) + return -1LL; + + return x; +} + +static void xfallocate(int fd, int mode, off_t offset, off_t length) +{ + int error; + +#ifdef HAVE_FALLOCATE + error = fallocate(fd, mode, offset, length); +#else + error = syscall(SYS_fallocate, fd, mode, offset, length); +#endif + /* + * EOPNOTSUPP: The FALLOC_FL_KEEP_SIZE is unsupported + * ENOSYS: The filesystem does not support sys_fallocate + */ + if (error < 0) { + if ((mode & FALLOC_FL_KEEP_SIZE) && errno == EOPNOTSUPP) + errx(EXIT_FAILURE, _("fallocate failed: keep size mode is unsupported")); + err(EXIT_FAILURE, _("fallocate failed")); + } +} + +#ifdef HAVE_POSIX_FALLOCATE +static void xposix_fallocate(int fd, off_t offset, off_t length) +{ + int error = posix_fallocate(fd, offset, length); + if (error < 0) { + err(EXIT_FAILURE, _("fallocate failed")); + } +} +#endif + +/* The real buffer size has to be bufsize + sizeof(uintptr_t) */ +static int is_nul(void *buf, size_t bufsize) +{ + typedef uintptr_t word; + void const *vp; + char const *cbuf = buf, *cp; + word const *wp = buf; + + /* set sentinel */ + memset((char *) buf + bufsize, '\1', sizeof(word)); + + /* Find first nonzero *word*, or the word with the sentinel. */ + while (*wp++ == 0) + continue; + + /* Find the first nonzero *byte*, or the sentinel. */ + vp = wp - 1; + cp = vp; + + while (*cp++ == 0) + continue; + + return cbuf + bufsize < cp; +} + +static void dig_holes(int fd, off_t file_off, off_t len) +{ + off_t file_end = len ? file_off + len : 0; + off_t hole_start = 0, hole_sz = 0; + uintmax_t ct = 0; + size_t bufsz; + char *buf; + struct stat st; +#if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE) + off_t cache_start = file_off; + /* + * We don't want to call POSIX_FADV_DONTNEED to discard cached + * data in PAGE_SIZE steps. IMHO it's overkill (too many syscalls). + * + * Let's assume that 1MiB (on system with 4K page size) is just + * a good compromise. + * -- kzak Feb-2014 + */ + const size_t cachesz = getpagesize() * 256; +#endif + + if (fstat(fd, &st) != 0) + err(EXIT_FAILURE, _("stat of %s failed"), filename); + + bufsz = st.st_blksize; + + if (lseek(fd, file_off, SEEK_SET) < 0) + err(EXIT_FAILURE, _("seek on %s failed"), filename); + + /* buffer + extra space for is_nul() sentinel */ + buf = xmalloc(bufsz + sizeof(uintptr_t)); + while (file_end == 0 || file_off < file_end) { + /* + * Detect data area (skip holes) + */ + off_t end, off; + + off = lseek(fd, file_off, SEEK_DATA); + if ((off == -1 && errno == ENXIO) || + (file_end && off >= file_end)) + break; + + end = lseek(fd, off, SEEK_HOLE); + if (file_end && end > file_end) + end = file_end; + +#if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE) + posix_fadvise(fd, off, end, POSIX_FADV_SEQUENTIAL); +#endif + /* + * Dig holes in the area + */ + while (off < end) { + ssize_t rsz = pread(fd, buf, bufsz, off); + if (rsz < 0 && errno) + err(EXIT_FAILURE, _("%s: read failed"), filename); + if (end && rsz > 0 && off > end - rsz) + rsz = end - off; + if (rsz <= 0) + break; + + if (is_nul(buf, rsz)) { + if (!hole_sz) /* new hole detected */ + hole_start = off; + hole_sz += rsz; + } else if (hole_sz) { + xfallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, + hole_start, hole_sz); + ct += hole_sz; + hole_sz = hole_start = 0; + } + +#if defined(POSIX_FADV_DONTNEED) && defined(HAVE_POSIX_FADVISE) + /* discard cached data */ + if (off - cache_start > (off_t) cachesz) { + size_t clen = off - cache_start; + + clen = (clen / cachesz) * cachesz; + posix_fadvise(fd, cache_start, clen, POSIX_FADV_DONTNEED); + cache_start = cache_start + clen; + } +#endif + off += rsz; + } + if (hole_sz) { + xfallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, + hole_start, hole_sz); + ct += hole_sz; + } + file_off = off; + } + + free(buf); + + if (verbose) { + char *str = size_to_human_string(SIZE_SUFFIX_3LETTER | SIZE_SUFFIX_SPACE, ct); + fprintf(stdout, _("%s: %s (%ju bytes) converted to sparse holes.\n"), + filename, str, ct); + free(str); + } +} + +int main(int argc, char **argv) +{ + int c; + int fd; + int mode = 0; + int dig = 0; + int posix = 0; + loff_t length = -2LL; + loff_t offset = 0; + + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "keep-size", no_argument, NULL, 'n' }, + { "punch-hole", no_argument, NULL, 'p' }, + { "collapse-range", no_argument, NULL, 'c' }, + { "dig-holes", no_argument, NULL, 'd' }, + { "insert-range", no_argument, NULL, 'i' }, + { "zero-range", no_argument, NULL, 'z' }, + { "offset", required_argument, NULL, 'o' }, + { "length", required_argument, NULL, 'l' }, + { "posix", no_argument, NULL, 'x' }, + { "verbose", no_argument, NULL, 'v' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'c', 'd', 'p', 'z' }, + { 'c', 'n' }, + { 'x', 'c', 'd', 'i', 'n', 'p', 'z'}, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "hvVncpdizxl:o:", longopts, NULL)) + != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch(c) { + case 'h': + usage(); + break; + case 'c': + mode |= FALLOC_FL_COLLAPSE_RANGE; + break; + case 'd': + dig = 1; + break; + case 'i': + mode |= FALLOC_FL_INSERT_RANGE; + break; + case 'l': + length = cvtnum(optarg); + break; + case 'n': + mode |= FALLOC_FL_KEEP_SIZE; + break; + case 'o': + offset = cvtnum(optarg); + break; + case 'p': + mode |= FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + break; + case 'z': + mode |= FALLOC_FL_ZERO_RANGE; + break; + case 'x': +#ifdef HAVE_POSIX_FALLOCATE + posix = 1; + break; +#else + errx(EXIT_FAILURE, _("posix_fallocate support is not compiled")); +#endif + case 'v': + verbose++; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (optind == argc) + errx(EXIT_FAILURE, _("no filename specified")); + + filename = argv[optind++]; + + if (optind != argc) + errx(EXIT_FAILURE, _("unexpected number of arguments")); + + if (dig) { + /* for --dig-holes the default is analyze all file */ + if (length == -2LL) + length = 0; + if (length < 0) + errx(EXIT_FAILURE, _("invalid length value specified")); + } else { + /* it's safer to require the range specification (--length --offset) */ + if (length == -2LL) + errx(EXIT_FAILURE, _("no length argument specified")); + if (length <= 0) + errx(EXIT_FAILURE, _("invalid length value specified")); + } + if (offset < 0) + errx(EXIT_FAILURE, _("invalid offset value specified")); + + /* O_CREAT makes sense only for the default fallocate(2) behavior + * when mode is no specified and new space is allocated */ + fd = open(filename, O_RDWR | (!dig && !mode ? O_CREAT : 0), + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), filename); + + if (dig) + dig_holes(fd, offset, length); +#ifdef HAVE_POSIX_FALLOCATE + else if (posix) + xposix_fallocate(fd, offset, length); +#endif + else + xfallocate(fd, mode, offset, length); + + if (close_fd(fd) != 0) + err(EXIT_FAILURE, _("write failed: %s"), filename); + + return EXIT_SUCCESS; +} diff --git a/sys-utils/flock.1 b/sys-utils/flock.1 new file mode 100644 index 0000000..5b1d635 --- /dev/null +++ b/sys-utils/flock.1 @@ -0,0 +1,197 @@ +.\" ----------------------------------------------------------------------- +.\" +.\" Copyright 2003-2006 H. Peter Anvin - All Rights Reserved +.\" +.\" Permission is hereby granted, free of charge, to any person +.\" obtaining a copy of this software and associated documentation +.\" files (the "Software"), to deal in the Software without +.\" restriction, including without limitation the rights to use, +.\" copy, modify, merge, publish, distribute, sublicense, and/or +.\" sell copies of the Software, and to permit persons to whom +.\" the Software is furnished to do so, subject to the following +.\" conditions: +.\" +.\" The above copyright notice and this permission notice shall +.\" be included in all copies or substantial portions of the Software. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +.\" EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +.\" OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +.\" NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +.\" HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +.\" WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +.\" OTHER DEALINGS IN THE SOFTWARE. +.\" +.\" ----------------------------------------------------------------------- +.TH FLOCK 1 "July 2014" "util-linux" "User Commands" +.SH NAME +flock \- manage locks from shell scripts +.SH SYNOPSIS +.B flock +[options] +.IR file | "directory command " [ arguments ] +.br +.B flock +[options] +.IR file | directory +.BI \-c " command" +.br +.B flock +.RI [options] " number" +.SH DESCRIPTION +.PP +This utility manages +.BR flock (2) +locks from within shell scripts or from the command line. +.PP +The first and second of the above forms wrap the lock around the execution of a +.IR command , +in a manner similar to +.BR su (1) +or +.BR newgrp (1). +They lock a specified \fIfile\fR or \fIdirectory\fR, which is created (assuming +appropriate permissions) if it does not already exist. By default, if the +lock cannot be immediately acquired, +.B flock +waits until the lock is available. +.PP +The third form uses an open file by its file descriptor \fInumber\fR. +See the examples below for how that can be used. +.SH OPTIONS +.TP +.BR \-c , " \-\-command " \fIcommand +Pass a single \fIcommand\fR, without arguments, to the shell with +.BR \-c . +.TP +.BR \-E , " \-\-conflict\-exit\-code " \fInumber +The exit code used when the \fB\-n\fP option is in use, and the +conflicting lock exists, or the \fB\-w\fP option is in use, +and the timeout is reached. The default value is \fB1\fR. +.TP +.BR \-F , " \-\-no\-fork" +Do not fork before executing +.IR command . +Upon execution the flock process is replaced by +.I command +which continues to hold the lock. This option is incompatible with +\fB\-\-close\fR as there would otherwise be nothing left to hold the lock. +.TP +.BR \-e , " \-x" , " \-\-exclusive" +Obtain an exclusive lock, sometimes called a write lock. This is the +default. +.TP +.BR \-n , " \-\-nb" , " \-\-nonblock" +Fail rather than wait if the lock cannot be +immediately acquired. +See the +.B \-E +option for the exit code used. +.TP +.BR \-o , " \-\-close" +Close the file descriptor on which the lock is held before executing +.IR command . +This is useful if +.I command +spawns a child process which should not be holding the lock. +.TP +.BR \-s , " \-\-shared" +Obtain a shared lock, sometimes called a read lock. +.TP +.BR \-u , " \-\-unlock" +Drop a lock. This is usually not required, since a lock is automatically +dropped when the file is closed. However, it may be required in special +cases, for example if the enclosed command group may have forked a background +process which should not be holding the lock. +.TP +.BR \-w , " \-\-wait" , " \-\-timeout " \fIseconds +Fail if the lock cannot be acquired within +.IR seconds . +Decimal fractional values are allowed. +See the +.B \-E +option for the exit code used. The zero number of +.I seconds +is interpreted as \fB\-\-nonblock\fR. +.TP +.B \-\-verbose +Report how long it took to acquire the lock, or why the lock could not be +obtained. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH EXAMPLES +.TP +shell1> flock /tmp -c cat +.TQ +shell2> flock -w .007 /tmp -c echo; /bin/echo $? +Set exclusive lock to directory /tmp and the second command will fail. +.TP +shell1> flock -s /tmp -c cat +.TQ +shell2> flock -s -w .007 /tmp -c echo; /bin/echo $? +Set shared lock to directory /tmp and the second command will not fail. +Notice that attempting to get exclusive lock with second command would fail. +.TP +shell> flock -x local-lock-file echo 'a b c' +Grab the exclusive lock "local-lock-file" before running echo with 'a b c'. +.TP +( +.TQ + flock -n 9 || exit 1 +.TQ + # ... commands executed under lock ... +.TQ +) 9>/var/lock/mylockfile +The form is convenient inside shell scripts. The mode used to open the file +doesn't matter to +.BR flock ; +using +.I > +or +.I >> +allows the lockfile to be created if it does not already exist, however, +write permission is required. Using +.I < +requires that the file already exists but only read permission is required. +.TP +[ "${FLOCKER}" != "$0" ] && exec env FLOCKER="$0" flock -en "$0" "$0" "$@" || : +This is useful boilerplate code for shell scripts. Put it at the top of the +shell script you want to lock and it'll automatically lock itself on the first +run. If the env var $FLOCKER is not set to the shell script that is being run, +then execute flock and grab an exclusive non-blocking lock (using the script +itself as the lock file) before re-execing itself with the right arguments. It +also sets the FLOCKER env var to the right value so it doesn't run again. +.SH "EXIT STATUS" +The command uses +.B sysexits.h +return values for everything, except when using either of the options +.B \-n +or +.B \-w +which report a failure to acquire the lock with a return value given by the +.B \-E +option, or 1 by default. +.PP +When using the \fIcommand\fR variant, and executing the child worked, then +the exit status is that of the child command. +.SH AUTHOR +.UR hpa@zytor.com +H. Peter Anvin +.UE +.SH COPYRIGHT +Copyright \(co 2003\-2006 H. Peter Anvin. +.br +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +.SH "SEE ALSO" +.BR flock (2) +.SH AVAILABILITY +The flock command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/flock.c b/sys-utils/flock.c new file mode 100644 index 0000000..ed25230 --- /dev/null +++ b/sys-utils/flock.c @@ -0,0 +1,380 @@ +/* Copyright 2003-2005 H. Peter Anvin - All Rights Reserved + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall + * be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <paths.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "c.h" +#include "nls.h" +#include "strutils.h" +#include "closestream.h" +#include "monotonic.h" +#include "timer.h" + +static void __attribute__((__noreturn__)) usage(void) +{ + fputs(USAGE_HEADER, stdout); + printf( + _(" %1$s [options] <file>|<directory> <command> [<argument>...]\n" + " %1$s [options] <file>|<directory> -c <command>\n" + " %1$s [options] <file descriptor number>\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, stdout); + fputs(_("Manage file locks from shell scripts.\n"), stdout); + + fputs(USAGE_OPTIONS, stdout); + fputs(_( " -s, --shared get a shared lock\n"), stdout); + fputs(_( " -x, --exclusive get an exclusive lock (default)\n"), stdout); + fputs(_( " -u, --unlock remove a lock\n"), stdout); + fputs(_( " -n, --nonblock fail rather than wait\n"), stdout); + fputs(_( " -w, --timeout <secs> wait for a limited amount of time\n"), stdout); + fputs(_( " -E, --conflict-exit-code <number> exit code after conflict or timeout\n"), stdout); + fputs(_( " -o, --close close file descriptor before running command\n"), stdout); + fputs(_( " -c, --command <command> run a single command string through the shell\n"), stdout); + fputs(_( " -F, --no-fork execute command without forking\n"), stdout); + fputs(_( " --verbose increase verbosity\n"), stdout); + fputs(USAGE_SEPARATOR, stdout); + printf(USAGE_HELP_OPTIONS(26)); + printf(USAGE_MAN_TAIL("flock(1)")); + exit(EXIT_SUCCESS); +} + +static sig_atomic_t timeout_expired = 0; + +static void timeout_handler(int sig __attribute__((__unused__)), + siginfo_t *info, + void *context __attribute__((__unused__))) +{ + if (info->si_code == SI_TIMER) + timeout_expired = 1; +} + +static int open_file(const char *filename, int *flags) +{ + + int fd; + int fl = *flags == 0 ? O_RDONLY : *flags; + + errno = 0; + fl |= O_NOCTTY | O_CREAT; + fd = open(filename, fl, 0666); + + /* Linux doesn't like O_CREAT on a directory, even though it + * should be a no-op; POSIX doesn't allow O_RDWR or O_WRONLY + */ + if (fd < 0 && errno == EISDIR) { + fl = O_RDONLY | O_NOCTTY; + fd = open(filename, fl); + } + if (fd < 0) { + warn(_("cannot open lock file %s"), filename); + if (errno == ENOMEM || errno == EMFILE || errno == ENFILE) + exit(EX_OSERR); + if (errno == EROFS || errno == ENOSPC) + exit(EX_CANTCREAT); + exit(EX_NOINPUT); + } + *flags = fl; + return fd; +} + +static void __attribute__((__noreturn__)) run_program(char **cmd_argv) +{ + execvp(cmd_argv[0], cmd_argv); + + warn(_("failed to execute %s"), cmd_argv[0]); + _exit((errno == ENOMEM) ? EX_OSERR : EX_UNAVAILABLE); +} + +int main(int argc, char *argv[]) +{ + static timer_t t_id; + struct itimerval timeout; + int have_timeout = 0; + int type = LOCK_EX; + int block = 0; + int open_flags = 0; + int fd = -1; + int opt, ix; + int do_close = 0; + int no_fork = 0; + int status; + int verbose = 0; + struct timeval time_start, time_done; + /* + * The default exit code for lock conflict or timeout + * is specified in man flock.1 + */ + int conflict_exit_code = 1; + char **cmd_argv = NULL, *sh_c_argv[4]; + const char *filename = NULL; + enum { + OPT_VERBOSE = CHAR_MAX + 1 + }; + static const struct option long_options[] = { + {"shared", no_argument, NULL, 's'}, + {"exclusive", no_argument, NULL, 'x'}, + {"unlock", no_argument, NULL, 'u'}, + {"nonblocking", no_argument, NULL, 'n'}, + {"nb", no_argument, NULL, 'n'}, + {"timeout", required_argument, NULL, 'w'}, + {"wait", required_argument, NULL, 'w'}, + {"conflict-exit-code", required_argument, NULL, 'E'}, + {"close", no_argument, NULL, 'o'}, + {"no-fork", no_argument, NULL, 'F'}, + {"verbose", no_argument, NULL, OPT_VERBOSE}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + {NULL, 0, NULL, 0} + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + strutils_set_exitcode(EX_USAGE); + + if (argc < 2) { + warnx(_("not enough arguments")); + errtryhelp(EX_USAGE); + } + + memset(&timeout, 0, sizeof timeout); + + optopt = 0; + while ((opt = + getopt_long(argc, argv, "+sexnoFuw:E:hV?", long_options, + &ix)) != EOF) { + switch (opt) { + case 's': + type = LOCK_SH; + break; + case 'e': + case 'x': + type = LOCK_EX; + break; + case 'u': + type = LOCK_UN; + break; + case 'o': + do_close = 1; + break; + case 'F': + no_fork = 1; + break; + case 'n': + block = LOCK_NB; + break; + case 'w': + have_timeout = 1; + strtotimeval_or_err(optarg, &timeout.it_value, + _("invalid timeout value")); + break; + case 'E': + conflict_exit_code = strtos32_or_err(optarg, + _("invalid exit code")); + break; + case OPT_VERBOSE: + verbose = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + exit(EX_OK); + case 'h': + usage(); + default: + errtryhelp(EX_USAGE); + } + } + + if (no_fork && do_close) + errx(EX_USAGE, + _("the --no-fork and --close options are incompatible")); + + if (argc > optind + 1) { + /* Run command */ + if (!strcmp(argv[optind + 1], "-c") || + !strcmp(argv[optind + 1], "--command")) { + if (argc != optind + 3) + errx(EX_USAGE, + _("%s requires exactly one command argument"), + argv[optind + 1]); + cmd_argv = sh_c_argv; + cmd_argv[0] = getenv("SHELL"); + if (!cmd_argv[0] || !*cmd_argv[0]) + cmd_argv[0] = _PATH_BSHELL; + cmd_argv[1] = "-c"; + cmd_argv[2] = argv[optind + 2]; + cmd_argv[3] = NULL; + } else { + cmd_argv = &argv[optind + 1]; + } + + filename = argv[optind]; + fd = open_file(filename, &open_flags); + + } else if (optind < argc) { + /* Use provided file descriptor */ + fd = strtos32_or_err(argv[optind], _("bad file descriptor")); + } else { + /* Bad options */ + errx(EX_USAGE, _("requires file descriptor, file or directory")); + } + + if (have_timeout) { + if (timeout.it_value.tv_sec == 0 && + timeout.it_value.tv_usec == 0) { + /* -w 0 is equivalent to -n; this has to be + * special-cased because setting an itimer to zero + * means disabled! + */ + have_timeout = 0; + block = LOCK_NB; + } else + if (setup_timer(&t_id, &timeout, &timeout_handler)) + err(EX_OSERR, _("cannot set up timer")); + } + + if (verbose) + gettime_monotonic(&time_start); + while (flock(fd, type | block)) { + switch (errno) { + case EWOULDBLOCK: + /* -n option set and failed to lock. */ + if (verbose) + warnx(_("failed to get lock")); + exit(conflict_exit_code); + case EINTR: + /* Signal received */ + if (timeout_expired) { + /* -w option set and failed to lock. */ + if (verbose) + warnx(_("timeout while waiting to get lock")); + exit(conflict_exit_code); + } + /* otherwise try again */ + continue; + case EIO: + case EBADF: /* since Linux 3.4 (commit 55725513) */ + /* Probably NFSv4 where flock() is emulated by fcntl(). + * Let's try to reopen in read-write mode. + */ + if (!(open_flags & O_RDWR) && + type != LOCK_SH && + filename && + access(filename, R_OK | W_OK) == 0) { + + close(fd); + open_flags = O_RDWR; + fd = open_file(filename, &open_flags); + + if (open_flags & O_RDWR) + break; + } + /* fallthrough */ + default: + /* Other errors */ + if (filename) + warn("%s", filename); + else + warn("%d", fd); + exit((errno == ENOLCK + || errno == ENOMEM) ? EX_OSERR : EX_DATAERR); + } + } + + if (have_timeout) + cancel_timer(&t_id); + if (verbose) { + struct timeval delta; + + gettime_monotonic(&time_done); + timersub(&time_done, &time_start, &delta); + printf(_("%s: getting lock took %ld.%06ld seconds\n"), + program_invocation_short_name, delta.tv_sec, + delta.tv_usec); + } + status = EX_OK; + + if (cmd_argv) { + pid_t w, f; + /* Clear any inherited settings */ + signal(SIGCHLD, SIG_DFL); + if (verbose) + printf(_("%s: executing %s\n"), program_invocation_short_name, cmd_argv[0]); + + if (!no_fork) { + f = fork(); + if (f < 0) + err(EX_OSERR, _("fork failed")); + + /* child */ + else if (f == 0) { + if (do_close) + close(fd); + run_program(cmd_argv); + + /* parent */ + } else { + do { + w = waitpid(f, &status, 0); + if (w == -1 && errno != EINTR) + break; + } while (w != f); + + if (w == -1) { + status = EXIT_FAILURE; + warn(_("waitpid failed")); + } else if (WIFEXITED(status)) + status = WEXITSTATUS(status); + else if (WIFSIGNALED(status)) + status = WTERMSIG(status) + 128; + else + /* WTF? */ + status = EX_OSERR; + } + + } else + /* no-fork execution */ + run_program(cmd_argv); + } + + return status; +} diff --git a/sys-utils/fsfreeze.8 b/sys-utils/fsfreeze.8 new file mode 100644 index 0000000..3cd6738 --- /dev/null +++ b/sys-utils/fsfreeze.8 @@ -0,0 +1,89 @@ +.TH FSFREEZE 8 "July 2014" "util-linux" "System Administration" +.SH NAME +fsfreeze \- suspend access to a filesystem (Ext3/4, ReiserFS, JFS, XFS) +.SH SYNOPSIS +.B fsfreeze +.BR \--freeze | \--unfreeze +.I mountpoint + +.SH DESCRIPTION +.B fsfreeze +suspends or resumes access to a filesystem. +.PP +.B fsfreeze +halts any new access to the filesystem and creates a stable image on disk. +.B fsfreeze +is intended to be used with hardware RAID devices that support the creation +of snapshots. +.PP +.B fsfreeze +is unnecessary for +.B device-mapper +devices. The device-mapper (and LVM) automatically freezes a filesystem +on the device when a snapshot creation is requested. +For more details see the +.BR dmsetup (8) +man page. +.PP +The +.I mountpoint +argument is the pathname of the directory where the filesystem +is mounted. +The filesystem must be mounted to be frozen (see +.BR mount (8)). +.PP +Note that access-time updates are also suspended if the filesystem is mounted with +the traditional atime behavior (mount option \fBstrictatime\fR, for more details see +.BR mount (8)). + +.SH OPTIONS +.TP +.BR \-f , " \-\-freeze" +This option requests the specified a filesystem to be frozen from new +modifications. When this is selected, all ongoing transactions in the +filesystem are allowed to complete, new write system calls are halted, other +calls which modify the filesystem are halted, and all dirty data, metadata, and +log information are written to disk. Any process attempting to write to the +frozen filesystem will block waiting for the filesystem to be unfrozen. +.sp +Note that even after freezing, the on-disk filesystem can contain +information on files that are still in the process of unlinking. +These files will not be unlinked until the filesystem is unfrozen +or a clean mount of the snapshot is complete. +.TP +.BR \-u , " \-\-unfreeze" +This option is used to un-freeze the filesystem and allow operations to +continue. Any filesystem modifications that were blocked by the freeze are +unblocked and allowed to complete. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH FILESYSTEM SUPPORT +This command will work only if filesystem supports has support for freezing. +List of these filesystems include (2016-12-18) +.BR btrfs , +.BR ext2/3/4 , +.BR f2fs , +.BR jfs , +.BR nilfs2 , +.BR reiserfs , +and +.BR xfs . +Previous list may be incomplete, as more filesystems get support. If in +doubt easiest way to know if a filesystem has support is create a small +loopback mount and test freezing it. +.SH AUTHOR +.PP +Written by Hajime Taira. +.SH NOTES +.PP +This man page is based on +.BR xfs_freeze (8). +.SH SEE ALSO +.BR mount (8) +.SH AVAILABILITY +The fsfreeze command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/fsfreeze.c b/sys-utils/fsfreeze.c new file mode 100644 index 0000000..401ab5c --- /dev/null +++ b/sys-utils/fsfreeze.c @@ -0,0 +1,152 @@ +/* + * fsfreeze.c -- Filesystem freeze/unfreeze IO for Linux + * + * Copyright (C) 2010 Hajime Taira <htaira@redhat.com> + * Masatake Yamato <yamato@redhat.com> + * + * This program is free software. You can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation: either version 1 or + * (at your option) any later version. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <fcntl.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <getopt.h> + +#include "c.h" +#include "blkdev.h" +#include "nls.h" +#include "closestream.h" +#include "optutils.h" + +enum fs_operation { + NOOP, + FREEZE, + UNFREEZE +}; + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %s [options] <mountpoint>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Suspend access to a filesystem.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -f, --freeze freeze the filesystem\n"), out); + fputs(_(" -u, --unfreeze unfreeze the filesystem\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(19)); + printf(USAGE_MAN_TAIL("fsfreeze(8)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + int fd = -1, c; + int action = NOOP, rc = EXIT_FAILURE; + char *path; + struct stat sb; + + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { "freeze", no_argument, NULL, 'f' }, + { "unfreeze", no_argument, NULL, 'u' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'f','u' }, /* freeze, unfreeze */ + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "hfuV", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch(c) { + case 'h': + usage(); + break; + case 'f': + action = FREEZE; + break; + case 'u': + action = UNFREEZE; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + exit(EXIT_SUCCESS); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (action == NOOP) + errx(EXIT_FAILURE, _("neither --freeze or --unfreeze specified")); + if (optind == argc) + errx(EXIT_FAILURE, _("no filename specified")); + path = argv[optind++]; + + if (optind != argc) { + warnx(_("unexpected number of arguments")); + errtryhelp(EXIT_FAILURE); + } + + fd = open(path, O_RDONLY); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), path); + + if (fstat(fd, &sb) == -1) { + warn(_("stat of %s failed"), path); + goto done; + } + + if (!S_ISDIR(sb.st_mode)) { + warnx(_("%s: is not a directory"), path); + goto done; + } + + switch (action) { + case FREEZE: + if (ioctl(fd, FIFREEZE, 0)) { + warn(_("%s: freeze failed"), path); + goto done; + } + break; + case UNFREEZE: + if (ioctl(fd, FITHAW, 0)) { + warn(_("%s: unfreeze failed"), path); + goto done; + } + break; + default: + abort(); + } + + rc = EXIT_SUCCESS; +done: + if (fd >= 0) + close(fd); + return rc; +} + diff --git a/sys-utils/fstab.5 b/sys-utils/fstab.5 new file mode 100644 index 0000000..a9e9f8c --- /dev/null +++ b/sys-utils/fstab.5 @@ -0,0 +1,248 @@ +.\" Copyright (c) 1980, 1989, 1991 The Regents of the University of California. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)fstab.5 6.5 (Berkeley) 5/10/91 +.\" +.TH FSTAB 5 "February 2015" "util-linux" "File Formats" +.SH NAME +fstab \- static information about the filesystems +.SH SYNOPSIS +.I /etc/fstab +.SH DESCRIPTION +The file +.B fstab +contains descriptive information about the filesystems the system can mount. +.B fstab +is only read by programs, and not written; it is the duty of the system +administrator to properly create and maintain this file. The order of records in +.B fstab +is important because +.BR fsck (8), +.BR mount (8), +and +.BR umount (8) +sequentially iterate through +.B fstab +doing their thing. + +Each filesystem is described on a separate line. +Fields on each line are separated by tabs or spaces. +Lines starting with '#' are comments. Blank lines are ignored. +.PP +The following is a typical example of an +.B fstab +entry: +.sp +.RS 7 +LABEL=t-home2 /home ext4 defaults,auto_da_alloc 0 2 +.RE + +.B The first field +.RI ( fs_spec ). +.RS +This field describes the block special device or +remote filesystem to be mounted. +.LP +For ordinary mounts, it will hold (a link to) a block special +device node (as created by +.BR mknod (8)) +for the device to be mounted, like `/dev/cdrom' or `/dev/sdb7'. +For NFS mounts, this field is <host>:<dir>, e.g., `knuth.aeb.nl:/'. +For filesystems with no storage, any string can be used, and will show up in +.BR df (1) +output, for example. Typical usage is `proc' for procfs; `mem', `none', +or `tmpfs' for tmpfs. Other special filesystems, like udev and sysfs, +are typically not listed in +.BR fstab . +.LP +LABEL=<label> or UUID=<uuid> may be given instead of a device name. +This is the recommended method, as device names are often a coincidence +of hardware detection order, and can change when other disks are added or removed. +For example, `LABEL=Boot' or `UUID=3e6be9de\%-8139\%-11d1\%-9106\%-a43f08d823a6'. +(Use a filesystem-specific tool like +.BR e2label (8), +.BR xfs_admin (8), +or +.BR fatlabel (8) +to set LABELs on filesystems). + +It's also possible to use PARTUUID= and PARTLABEL=. These partitions identifiers +are supported for example for GUID Partition Table (GPT). + +See +.BR mount (8), +.BR blkid (8) +or +.BR lsblk (8) +for more details about device identifiers. + +.LP +Note that +.BR mount (8) +uses UUIDs as strings. The string representation of the UUID should be based on +lower case characters. +.RE + +.B The second field +.RI ( fs_file ). +.RS +This field describes the mount point (target) for the filesystem. For swap partitions, this +field should be specified as `none'. If the name of the mount point +contains spaces or tabs these can be escaped as `\\040' and '\\011' +respectively. +.RE + +.B The third field +.RI ( fs_vfstype ). +.RS +This field describes the type of the filesystem. Linux supports many +filesystem types: ext4, xfs, btrfs, f2fs, vfat, ntfs, hfsplus, +tmpfs, sysfs, proc, iso9660, udf, squashfs, nfs, cifs, and many more. +For more details, see +.BR mount (8). + +An entry +.I swap +denotes a file or partition to be used +for swapping, cf.\& +.BR swapon (8). +An entry +.I none +is useful for bind or move mounts. + +More than one type may be specified in a comma-separated list. + +.BR mount (8) +and +.BR umount (8) +support filesystem +.IR subtypes . +The subtype is defined by '.subtype' suffix. For +example 'fuse.sshfs'. It's recommended to use subtype notation rather than add +any prefix to the first fstab field (for example 'sshfs#example.com' is +deprecated). +.RE + +.B The fourth field +.RI ( fs_mntops ). +.RS +This field describes the mount options associated with the filesystem. + +It is formatted as a comma-separated list of options. +It contains at least the type of mount +.RB ( ro +or +.BR rw ), +plus any additional options appropriate to the filesystem +type (including performance-tuning options). +For details, see +.BR mount (8) +or +.BR swapon (8). + +Basic filesystem-independent options are: +.TP +.B defaults +use default options: rw, suid, dev, exec, auto, nouser, and async. +.TP +.B noauto +do not mount when "mount -a" is given (e.g., at boot time) +.TP +.B user +allow a user to mount +.TP +.B owner +allow device owner to mount +.TP +.B comment +or +.B x-<name> +for use by fstab-maintaining programs +.TP +.B nofail +do not report errors for this device if it does not exist. +.RE + +.B The fifth field +.RI ( fs_freq ). +.RS +This field is used by +.BR dump (8) +to determine which filesystems need to be dumped. +Defaults to zero (don't dump) if not present. +.RE + +.B The sixth field +.RI ( fs_passno ). +.RS +This field is used by +.BR fsck (8) +to determine the order in which filesystem checks are done at +boot time. The root filesystem should be specified with a +.I fs_passno +of 1. Other filesystems should have a +.I fs_passno +of 2. Filesystems within a drive will be checked sequentially, but +filesystems on different drives will be checked at the same time to utilize +parallelism available in the hardware. +Defaults to zero (don't fsck) if not present. + +.SH NOTES +The proper way to read records from +.B fstab +is to use the routines +.BR getmntent (3) +or +.BR libmount . + +The keyword +.B ignore +as a filesystem type (3rd field) is no longer supported by the pure +libmount based mount utility (since util-linux v2.22). + +.SH FILES +.IR /etc/fstab , +.I <fstab.h> +.SH "SEE ALSO" +.BR getmntent (3), +.BR fs (5), +.BR findmnt (8), +.BR mount (8), +.BR swapon (8) +.SH HISTORY +The ancestor of this +.B fstab +file format appeared in 4.0BSD. +.\" But without comment convention, and options and vfs_type. +.\" Instead there was a type rw/ro/rq/sw/xx, where xx is the present 'ignore'. +.SH AVAILABILITY +This man page is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/fstrim.8 b/sys-utils/fstrim.8 new file mode 100644 index 0000000..ff572a4 --- /dev/null +++ b/sys-utils/fstrim.8 @@ -0,0 +1,131 @@ +.TH FSTRIM 8 "July 2014" "util-linux" "System Administration" +.SH NAME +fstrim \- discard unused blocks on a mounted filesystem +.SH SYNOPSIS +.B fstrim +.RB [ \-Aa ] +.RB [ \-o +.IR offset ] +.RB [ \-l +.IR length ] +.RB [ \-m +.IR minimum-size ] +.RB [ \-v ] +.I mountpoint + +.SH DESCRIPTION +.B fstrim +is used on a mounted filesystem to discard (or "trim") blocks which are not in +use by the filesystem. This is useful for solid-state drives (SSDs) and +thinly-provisioned storage. +.PP +By default, +.B fstrim +will discard all unused blocks in the filesystem. Options may be used to +modify this behavior based on range or size, as explained below. +.PP +The +.I mountpoint +argument is the pathname of the directory where the filesystem +is mounted. +.PP +Running +.B fstrim +frequently, or even using +.BR "mount -o discard" , +might negatively affect the lifetime of poor-quality SSD devices. For most +desktop and server systems a sufficient trimming frequency is once a week. +Note that not all +devices support a queued trim, so each trim command incurs a performance penalty +on whatever else might be trying to use the disk at the time. + +.SH OPTIONS +The \fIoffset\fR, \fIlength\fR, and \fIminimum-size\fR arguments may be +followed by the multiplicative suffixes KiB (=1024), +MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB and YiB (the "iB" +is optional, e.g., "K" has the same meaning as "KiB") or the suffixes +KB (=1000), MB (=1000*1000), and so on for GB, TB, PB, EB, ZB and YB. + +.IP "\fB\-A, \-\-fstab\fP" +Trim all mounted filesystems mentioned in \fI/etc/fstab\fR on devices that support the +discard operation. +The other supplied options, like \fB\-\-offset\fR, \fB\-\-length\fR and +\fB-\-minimum\fR, are applied to all these devices. +Errors from filesystems that do not support the discard operation are silently +ignored. + +.IP "\fB\-a, \-\-all\fP" +Trim all mounted filesystems on devices that support the discard operation. +The other supplied options, like \fB\-\-offset\fR, \fB\-\-length\fR and +\fB-\-minimum\fR, are applied to all these devices. +Errors from filesystems that do not support the discard operation are silently +ignored. +.IP "\fB\-n, \-\-dry\-run\fP" +This option does everything apart from actually call FITRIM ioctl. +.IP "\fB\-o, \-\-offset\fP \fIoffset\fP" +Byte offset in the filesystem from which to begin searching for free blocks +to discard. The default value is zero, starting at the beginning of the +filesystem. +.IP "\fB\-l, \-\-length\fP \fIlength\fP" +The number of bytes (after the starting point) to search for free blocks +to discard. If the specified value extends past the end of the filesystem, +.B fstrim +will stop at the filesystem size boundary. The default value extends to +the end of the filesystem. +.IP "\fB\-m, \-\-minimum\fP \fIminimum-size\fP" +Minimum contiguous free range to discard, in bytes. (This value is internally +rounded up to a multiple of the filesystem block size.) Free ranges smaller +than this will be ignored. By increasing this value, the fstrim operation +will complete more quickly for filesystems with badly fragmented freespace, +although not all blocks will be discarded. The default value is zero, +discarding every free block. +.IP "\fB\-v, \-\-verbose\fP" +Verbose execution. With this option +.B fstrim +will output the number of bytes passed from the filesystem +down the block stack to the device for potential discard. This number is a +maximum discard amount from the storage device's perspective, because +.I FITRIM +ioctl called repeated will keep sending the same sectors for discard repeatedly. +.sp +.B fstrim +will report the same potential discard bytes each time, but only sectors which +had been written to between the discards would actually be discarded by the +storage device. Further, the kernel block layer reserves the right to adjust +the discard ranges to fit raid stripe geometry, non-trim capable devices in a +LVM setup, etc. These reductions would not be reflected in fstrim_range.len +(the +.B --length +option). +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. + +.SH RETURN CODES +.IP 0 +success +.IP 1 +failure +.IP 32 +all failed +.IP 64 +some filesystem discards have succeeded, some failed +.PP +The command +.B fstrim --all +returns 0 (all succeeded), 32 (all failed) or 64 (some failed, some succeeded). + +.SH AUTHOR +.nf +Lukas Czerner <lczerner@redhat.com> +Karel Zak <kzak@redhat.com> +.fi +.SH SEE ALSO +.BR blkdiscard (8), +.BR mount (8) +.SH AVAILABILITY +The fstrim command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/fstrim.c b/sys-utils/fstrim.c new file mode 100644 index 0000000..2a67892 --- /dev/null +++ b/sys-utils/fstrim.c @@ -0,0 +1,417 @@ +/* + * fstrim.c -- discard the part (or whole) of mounted filesystem. + * + * Copyright (C) 2010 Red Hat, Inc. All rights reserved. + * Written by Lukas Czerner <lczerner@redhat.com> + * Karel Zak <kzak@redhat.com> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This program uses FITRIM ioctl to discard parts or the whole filesystem + * online (mounted). You can specify range (start and length) to be + * discarded, or simply discard whole filesystem. + */ + +#include <string.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <fcntl.h> +#include <limits.h> +#include <getopt.h> + +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <linux/fs.h> + +#include "nls.h" +#include "strutils.h" +#include "c.h" +#include "closestream.h" +#include "pathnames.h" +#include "sysfs.h" + +#include <libmount.h> + + +#ifndef FITRIM +struct fstrim_range { + uint64_t start; + uint64_t len; + uint64_t minlen; +}; +#define FITRIM _IOWR('X', 121, struct fstrim_range) +#endif + +struct fstrim_control { + struct fstrim_range range; + + unsigned int verbose : 1, + fstab : 1, + dryrun : 1; +}; + +/* returns: 0 = success, 1 = unsupported, < 0 = error */ +static int fstrim_filesystem(struct fstrim_control *ctl, const char *path, const char *devname) +{ + int fd, rc; + struct stat sb; + struct fstrim_range range; + + /* kernel modifies the range */ + memcpy(&range, &ctl->range, sizeof(range)); + + fd = open(path, O_RDONLY); + if (fd < 0) { + warn(_("cannot open %s"), path); + rc = -errno; + goto done; + } + if (fstat(fd, &sb) == -1) { + warn(_("stat of %s failed"), path); + rc = -errno; + goto done; + } + if (!S_ISDIR(sb.st_mode)) { + warnx(_("%s: not a directory"), path); + rc = -EINVAL; + goto done; + } + + if (ctl->dryrun) { + if (devname) + printf(_("%s: 0 B (dry run) trimmed on %s\n"), path, devname); + else + printf(_("%s: 0 B (dry run) trimmed\n"), path); + rc = 0; + goto done; + } + + errno = 0; + if (ioctl(fd, FITRIM, &range)) { + rc = errno == EOPNOTSUPP || errno == ENOTTY ? 1 : -errno; + + if (rc != 1) + warn(_("%s: FITRIM ioctl failed"), path); + goto done; + } + + if (ctl->verbose) { + char *str = size_to_human_string( + SIZE_SUFFIX_3LETTER | SIZE_SUFFIX_SPACE, + (uint64_t) range.len); + if (devname) + /* TRANSLATORS: The standard value here is a very large number. */ + printf(_("%s: %s (%" PRIu64 " bytes) trimmed on %s\n"), + path, str, (uint64_t) range.len, devname); + else + /* TRANSLATORS: The standard value here is a very large number. */ + printf(_("%s: %s (%" PRIu64 " bytes) trimmed\n"), + path, str, (uint64_t) range.len); + + free(str); + } + + rc = 0; +done: + if (fd >= 0) + close(fd); + return rc; +} + +static int has_discard(const char *devname, struct path_cxt **wholedisk) +{ + struct path_cxt *pc = NULL; + uint64_t dg = 0; + dev_t disk = 0, dev; + int rc = -1; + + dev = sysfs_devname_to_devno(devname); + if (!dev) + goto fail; + + pc = ul_new_sysfs_path(dev, NULL, NULL); + if (!pc) + goto fail; + + /* + * This is tricky to read the info from sys/, because the queue + * attributes are provided for whole devices (disk) only. We're trying + * to reuse the whole-disk sysfs context to optimize this stuff (as + * system usually have just one disk only). + */ + rc = sysfs_blkdev_get_wholedisk(pc, NULL, 0, &disk); + if (rc != 0 || !disk) + goto fail; + + if (dev != disk) { + /* Partition, try reuse whole-disk context if valid for the + * current device, otherwise create new context for the + * whole-disk. + */ + if (*wholedisk && sysfs_blkdev_get_devno(*wholedisk) != disk) { + ul_unref_path(*wholedisk); + *wholedisk = NULL; + } + if (!*wholedisk) { + *wholedisk = ul_new_sysfs_path(disk, NULL, NULL); + if (!*wholedisk) + goto fail; + } + sysfs_blkdev_set_parent(pc, *wholedisk); + } + + rc = ul_path_read_u64(pc, &dg, "queue/discard_granularity"); + + ul_unref_path(pc); + return rc == 0 && dg > 0; +fail: + ul_unref_path(pc); + return 1; +} + + +static int uniq_fs_target_cmp( + struct libmnt_table *tb __attribute__((__unused__)), + struct libmnt_fs *a, + struct libmnt_fs *b) +{ + return !mnt_fs_streq_target(a, mnt_fs_get_target(b)); +} + +static int uniq_fs_source_cmp( + struct libmnt_table *tb __attribute__((__unused__)), + struct libmnt_fs *a, + struct libmnt_fs *b) +{ + if (mnt_fs_is_pseudofs(a) || mnt_fs_is_netfs(a) || + mnt_fs_is_pseudofs(b) || mnt_fs_is_netfs(b)) + return 1; + + return !mnt_fs_streq_srcpath(a, mnt_fs_get_srcpath(b)); +} + +/* + * fstrim --all follows "mount -a" return codes: + * + * 0 = all success + * 32 = all failed + * 64 = some failed, some success + */ +static int fstrim_all(struct fstrim_control *ctl) +{ + struct libmnt_fs *fs; + struct libmnt_iter *itr; + struct libmnt_table *tab; + struct libmnt_cache *cache = NULL; + struct path_cxt *wholedisk = NULL; + int cnt = 0, cnt_err = 0; + const char *filename = _PATH_PROC_MOUNTINFO; + + mnt_init_debug(0); + ul_path_init_debug(); + + itr = mnt_new_iter(MNT_ITER_BACKWARD); + if (!itr) + err(MNT_EX_FAIL, _("failed to initialize libmount iterator")); + + if (ctl->fstab) + filename = mnt_get_fstab_path(); + + tab = mnt_new_table_from_file(filename); + if (!tab) + err(MNT_EX_FAIL, _("failed to parse %s"), filename); + + /* de-duplicate by mountpoints */ + mnt_table_uniq_fs(tab, 0, uniq_fs_target_cmp); + + /* de-duplicate by source */ + mnt_table_uniq_fs(tab, MNT_UNIQ_FORWARD, uniq_fs_source_cmp); + + if (ctl->fstab) { + cache = mnt_new_cache(); + if (!cache) + err(MNT_EX_FAIL, _("failed to initialize libmount cache")); + } + + while (mnt_table_next_fs(tab, itr, &fs) == 0) { + const char *src = mnt_fs_get_srcpath(fs), + *tgt = mnt_fs_get_target(fs); + char *path; + int rc = 1; + + if (!tgt || mnt_fs_is_pseudofs(fs) || mnt_fs_is_netfs(fs)) + continue; + + if (!src && cache) { + /* convert LABEL= (etc.) from fstab to paths */ + const char *spec = mnt_fs_get_source(fs); + + if (!spec) + continue; + src = mnt_resolve_spec(spec, cache); + } + + if (!src || *src != '/') + continue; + + /* Is it really accessible mountpoint? Not all mountpoints are + * accessible (maybe over mounted by another filesystem) */ + path = mnt_get_mountpoint(tgt); + if (path && strcmp(path, tgt) == 0) + rc = 0; + free(path); + if (rc) + continue; /* overlaying mount */ + + if (!has_discard(src, &wholedisk)) + continue; + cnt++; + + /* + * We're able to detect that the device supports discard, but + * things also depend on filesystem or device mapping, for + * example vfat or LUKS (by default) does not support FSTRIM. + * + * This is reason why we ignore EOPNOTSUPP and ENOTTY errors + * from discard ioctl. + */ + if (fstrim_filesystem(ctl, tgt, src) < 0) + cnt_err++; + } + + ul_unref_path(wholedisk); + mnt_unref_table(tab); + mnt_free_iter(itr); + mnt_unref_cache(cache); + + if (cnt && cnt == cnt_err) + return MNT_EX_FAIL; /* all failed */ + if (cnt && cnt_err) + return MNT_EX_SOMEOK; /* some ok */ + + return MNT_EX_SUCCESS; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %s [options] <mount point>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Discard unused blocks on a mounted filesystem.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --all trim all supported mounted filesystems\n"), out); + fputs(_(" -A, --fstab trim all supported mounted filesystems from /etc/fstab\n"), out); + fputs(_(" -o, --offset <num> the offset in bytes to start discarding from\n"), out); + fputs(_(" -l, --length <num> the number of bytes to discard\n"), out); + fputs(_(" -m, --minimum <num> the minimum extent length to discard\n"), out); + fputs(_(" -v, --verbose print number of discarded bytes\n"), out); + fputs(_(" -n, --dry-run does everything, but trim\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(21)); + printf(USAGE_MAN_TAIL("fstrim(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + char *path = NULL; + int c, rc, all = 0; + struct fstrim_control ctl = { + .range = { .len = ULLONG_MAX } + }; + + static const struct option longopts[] = { + { "all", no_argument, NULL, 'a' }, + { "fstab", no_argument, NULL, 'A' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "offset", required_argument, NULL, 'o' }, + { "length", required_argument, NULL, 'l' }, + { "minimum", required_argument, NULL, 'm' }, + { "verbose", no_argument, NULL, 'v' }, + { "dry-run", no_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "Aahl:m:no:Vv", longopts, NULL)) != -1) { + switch(c) { + case 'A': + ctl.fstab = 1; + /* fallthrough */ + case 'a': + all = 1; + break; + case 'n': + ctl.dryrun = 1; + break; + case 'h': + usage(); + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'l': + ctl.range.len = strtosize_or_err(optarg, + _("failed to parse length")); + break; + case 'o': + ctl.range.start = strtosize_or_err(optarg, + _("failed to parse offset")); + break; + case 'm': + ctl.range.minlen = strtosize_or_err(optarg, + _("failed to parse minimum extent length")); + break; + case 'v': + ctl.verbose = 1; + break; + default: + errtryhelp(EXIT_FAILURE); + break; + } + } + + if (!all) { + if (optind == argc) + errx(EXIT_FAILURE, _("no mountpoint specified")); + path = argv[optind++]; + } + + if (optind != argc) { + warnx(_("unexpected number of arguments")); + errtryhelp(EXIT_FAILURE); + } + + if (all) + return fstrim_all(&ctl); /* MNT_EX_* codes */ + + rc = fstrim_filesystem(&ctl, path, NULL); + if (rc == 1) + warnx(_("%s: the discard operation is not supported"), path); + + return rc == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/sys-utils/fstrim.service.in b/sys-utils/fstrim.service.in new file mode 100644 index 0000000..2d5daf9 --- /dev/null +++ b/sys-utils/fstrim.service.in @@ -0,0 +1,7 @@ +[Unit] +Description=Discard unused blocks on filesystems from /etc/fstab +Documentation=man:fstrim(8) + +[Service] +Type=oneshot +ExecStart=@sbindir@/fstrim -Av diff --git a/sys-utils/fstrim.timer b/sys-utils/fstrim.timer new file mode 100644 index 0000000..3a3762d --- /dev/null +++ b/sys-utils/fstrim.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Discard unused blocks once a week +Documentation=man:fstrim + +[Timer] +OnCalendar=weekly +AccuracySec=1h +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/sys-utils/hwclock-cmos.c b/sys-utils/hwclock-cmos.c new file mode 100644 index 0000000..a11f676 --- /dev/null +++ b/sys-utils/hwclock-cmos.c @@ -0,0 +1,420 @@ +/* + * i386 CMOS starts out with 14 bytes clock data alpha has something + * similar, but with details depending on the machine type. + * + * byte 0: seconds 0-59 + * byte 2: minutes 0-59 + * byte 4: hours 0-23 in 24hr mode, + * 1-12 in 12hr mode, with high bit unset/set + * if am/pm. + * byte 6: weekday 1-7, Sunday=1 + * byte 7: day of the month 1-31 + * byte 8: month 1-12 + * byte 9: year 0-99 + * + * Numbers are stored in BCD/binary if bit 2 of byte 11 is unset/set The + * clock is in 12hr/24hr mode if bit 1 of byte 11 is unset/set The clock is + * undefined (being updated) if bit 7 of byte 10 is set. The clock is frozen + * (to be updated) by setting bit 7 of byte 11 Bit 7 of byte 14 indicates + * whether the CMOS clock is reliable: it is 1 if RTC power has been good + * since this bit was last read; it is 0 when the battery is dead and system + * power has been off. + * + * Avoid setting the RTC clock within 2 seconds of the day rollover that + * starts a new month or enters daylight saving time. + * + * The century situation is messy: + * + * Usually byte 50 (0x32) gives the century (in BCD, so 19 or 20 hex), but + * IBM PS/2 has (part of) a checksum there and uses byte 55 (0x37). + * Sometimes byte 127 (0x7f) or Bank 1, byte 0x48 gives the century. The + * original RTC will not access any century byte; some modern versions will. + * If a modern RTC or BIOS increments the century byte it may go from 0x19 + * to 0x20, but in some buggy cases 0x1a is produced. + */ +/* + * A struct tm has int fields + * tm_sec 0-59, 60 or 61 only for leap seconds + * tm_min 0-59 + * tm_hour 0-23 + * tm_mday 1-31 + * tm_mon 0-11 + * tm_year number of years since 1900 + * tm_wday 0-6, 0=Sunday + * tm_yday 0-365 + * tm_isdst >0: yes, 0: no, <0: unknown + */ + +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include "c.h" +#include "nls.h" +#include "pathnames.h" + +/* for inb, outb */ +#if defined(__i386__) || defined(__x86_64__) +# ifdef HAVE_SYS_IO_H +# include <sys/io.h> +# elif defined(HAVE_ASM_IO_H) +# include <asm/io.h> +# else +# undef __i386__ +# undef __x86_64__ +# warning "disable cmos access - no sys/io.h or asm/io.h" +static void outb(int a __attribute__((__unused__)), + int b __attribute__((__unused__))) +{ +} + +static int inb(int c __attribute__((__unused__))) +{ + return 0; +} +# endif /* __i386__ __x86_64__ */ +#else +# warning "disable cmos access - not i386 or x86_64" +static void outb(int a __attribute__((__unused__)), + int b __attribute__((__unused__))) +{ +} + +static int inb(int c __attribute__((__unused__))) +{ + return 0; +} +#endif /* for inb, outb */ + +#include "hwclock.h" + +#define BCD_TO_BIN(val) ((val)=((val)&15) + ((val)>>4)*10) +#define BIN_TO_BCD(val) ((val)=(((val)/10)<<4) + (val)%10) + +#define IOPL_NOT_IMPLEMENTED -2 + +/* + * POSIX uses 1900 as epoch for a struct tm, and 1970 for a time_t. + */ +#define TM_EPOCH 1900 + +static unsigned short clock_ctl_addr = 0x70; +static unsigned short clock_data_addr = 0x71; + +/* + * Hmmh, this isn't very atomic. Maybe we should force an error instead? + * + * TODO: optimize the access to CMOS by mlockall(MCL_CURRENT) and SCHED_FIFO + */ +static unsigned long atomic(unsigned long (*op) (unsigned long), + unsigned long arg) +{ + return (*op) (arg); +} + +/* + * We only want to read CMOS data, but unfortunately writing to bit 7 + * disables (1) or enables (0) NMI; since this bit is read-only we have + * to guess the old status. Various docs suggest that one should disable + * NMI while reading/writing CMOS data, and enable it again afterwards. + * This would yield the sequence + * + * outb (reg | 0x80, 0x70); + * val = inb(0x71); + * outb (0x0d, 0x70); // 0x0d: random read-only location + * + * Other docs state that "any write to 0x70 should be followed by an + * action to 0x71 or the RTC will be left in an unknown state". Most + * docs say that it doesn't matter at all what one does. + * + * bit 0x80: disable NMI while reading - should we? Let us follow the + * kernel and not disable. Called only with 0 <= reg < 128 + */ + +static inline unsigned long cmos_read(unsigned long reg) +{ + outb(reg, clock_ctl_addr); + return inb(clock_data_addr); +} + +static inline unsigned long cmos_write(unsigned long reg, unsigned long val) +{ + outb(reg, clock_ctl_addr); + outb(val, clock_data_addr); + return 0; +} + +static unsigned long cmos_set_time(unsigned long arg) +{ + unsigned char save_control, save_freq_select, pmbit = 0; + struct tm tm = *(struct tm *)arg; + +/* + * CMOS byte 10 (clock status register A) has 3 bitfields: + * bit 7: 1 if data invalid, update in progress (read-only bit) + * (this is raised 224 us before the actual update starts) + * 6-4 select base frequency + * 010: 32768 Hz time base (default) + * 111: reset + * all other combinations are manufacturer-dependent + * (e.g.: DS1287: 010 = start oscillator, anything else = stop) + * 3-0 rate selection bits for interrupt + * 0000 none (may stop RTC) + * 0001, 0010 give same frequency as 1000, 1001 + * 0011 122 microseconds (minimum, 8192 Hz) + * .... each increase by 1 halves the frequency, doubles the period + * 1111 500 milliseconds (maximum, 2 Hz) + * 0110 976.562 microseconds (default 1024 Hz) + */ + save_control = cmos_read(11); /* tell the clock it's being set */ + cmos_write(11, (save_control | 0x80)); + save_freq_select = cmos_read(10); /* stop and reset prescaler */ + cmos_write(10, (save_freq_select | 0x70)); + + tm.tm_year %= 100; + tm.tm_mon += 1; + tm.tm_wday += 1; + + if (!(save_control & 0x02)) { /* 12hr mode; the default is 24hr mode */ + if (tm.tm_hour == 0) + tm.tm_hour = 24; + if (tm.tm_hour > 12) { + tm.tm_hour -= 12; + pmbit = 0x80; + } + } + + if (!(save_control & 0x04)) { /* BCD mode - the default */ + BIN_TO_BCD(tm.tm_sec); + BIN_TO_BCD(tm.tm_min); + BIN_TO_BCD(tm.tm_hour); + BIN_TO_BCD(tm.tm_wday); + BIN_TO_BCD(tm.tm_mday); + BIN_TO_BCD(tm.tm_mon); + BIN_TO_BCD(tm.tm_year); + } + + cmos_write(0, tm.tm_sec); + cmos_write(2, tm.tm_min); + cmos_write(4, tm.tm_hour | pmbit); + cmos_write(6, tm.tm_wday); + cmos_write(7, tm.tm_mday); + cmos_write(8, tm.tm_mon); + cmos_write(9, tm.tm_year); + + /* + * The kernel sources, linux/arch/i386/kernel/time.c, have the + * following comment: + * + * The following flags have to be released exactly in this order, + * otherwise the DS12887 (popular MC146818A clone with integrated + * battery and quartz) will not reset the oscillator and will not + * update precisely 500 ms later. You won't find this mentioned in + * the Dallas Semiconductor data sheets, but who believes data + * sheets anyway ... -- Markus Kuhn + */ + cmos_write(11, save_control); + cmos_write(10, save_freq_select); + return 0; +} + +static int hclock_read(unsigned long reg) +{ + return atomic(cmos_read, reg); +} + +static void hclock_set_time(const struct tm *tm) +{ + atomic(cmos_set_time, (unsigned long)(tm)); +} + +static inline int cmos_clock_busy(void) +{ + return + /* poll bit 7 (UIP) of Control Register A */ + (hclock_read(10) & 0x80); +} + +static int synchronize_to_clock_tick_cmos(const struct hwclock_control *ctl + __attribute__((__unused__))) +{ + int i; + + /* + * Wait for rise. Should be within a second, but in case something + * weird happens, we have a limit on this loop to reduce the impact + * of this failure. + */ + for (i = 0; !cmos_clock_busy(); i++) + if (i >= 10000000) + return 1; + + /* Wait for fall. Should be within 2.228 ms. */ + for (i = 0; cmos_clock_busy(); i++) + if (i >= 1000000) + return 1; + return 0; +} + +/* + * Read the hardware clock and return the current time via <tm> argument. + * Assume we have an ISA machine and read the clock directly with CPU I/O + * instructions. + * + * This function is not totally reliable. It takes a finite and + * unpredictable amount of time to execute the code below. During that time, + * the clock may change and we may even read an invalid value in the middle + * of an update. We do a few checks to minimize this possibility, but only + * the kernel can actually read the clock properly, since it can execute + * code in a short and predictable amount of time (by turning of + * interrupts). + * + * In practice, the chance of this function returning the wrong time is + * extremely remote. + */ +static int read_hardware_clock_cmos(const struct hwclock_control *ctl + __attribute__((__unused__)), struct tm *tm) +{ + unsigned char status = 0, pmbit = 0; + + while (1) { + /* + * Bit 7 of Byte 10 of the Hardware Clock value is the + * Update In Progress (UIP) bit, which is on while and 244 + * uS before the Hardware Clock updates itself. It updates + * the counters individually, so reading them during an + * update would produce garbage. The update takes 2mS, so we + * could be spinning here that long waiting for this bit to + * turn off. + * + * Furthermore, it is pathologically possible for us to be + * in this code so long that even if the UIP bit is not on + * at first, the clock has changed while we were running. We + * check for that too, and if it happens, we start over. + */ + if (!cmos_clock_busy()) { + /* No clock update in progress, go ahead and read */ + tm->tm_sec = hclock_read(0); + tm->tm_min = hclock_read(2); + tm->tm_hour = hclock_read(4); + tm->tm_wday = hclock_read(6); + tm->tm_mday = hclock_read(7); + tm->tm_mon = hclock_read(8); + tm->tm_year = hclock_read(9); + status = hclock_read(11); + /* + * Unless the clock changed while we were reading, + * consider this a good clock read . + */ + if (tm->tm_sec == hclock_read(0)) + break; + } + /* + * Yes, in theory we could have been running for 60 seconds + * and the above test wouldn't work! + */ + } + + if (!(status & 0x04)) { /* BCD mode - the default */ + BCD_TO_BIN(tm->tm_sec); + BCD_TO_BIN(tm->tm_min); + pmbit = (tm->tm_hour & 0x80); + tm->tm_hour &= 0x7f; + BCD_TO_BIN(tm->tm_hour); + BCD_TO_BIN(tm->tm_wday); + BCD_TO_BIN(tm->tm_mday); + BCD_TO_BIN(tm->tm_mon); + BCD_TO_BIN(tm->tm_year); + } + + /* + * We don't use the century byte of the Hardware Clock since we + * don't know its address (usually 50 or 55). Here, we follow the + * advice of the X/Open Base Working Group: "if century is not + * specified, then values in the range [69-99] refer to years in the + * twentieth century (1969 to 1999 inclusive), and values in the + * range [00-68] refer to years in the twenty-first century (2000 to + * 2068 inclusive)." + */ + tm->tm_wday -= 1; + tm->tm_mon -= 1; + if (tm->tm_year < 69) + tm->tm_year += 100; + if (pmbit) { + tm->tm_hour += 12; + if (tm->tm_hour == 24) + tm->tm_hour = 0; + } + + tm->tm_isdst = -1; /* don't know whether it's daylight */ + return 0; +} + +static int set_hardware_clock_cmos(const struct hwclock_control *ctl + __attribute__((__unused__)), + const struct tm *new_broken_time) +{ + hclock_set_time(new_broken_time); + return 0; +} + +#if defined(__i386__) || defined(__x86_64__) +# if defined(HAVE_IOPL) +static int i386_iopl(const int level) +{ + return iopl(level); +} +# else +static int i386_iopl(const int level __attribute__ ((__unused__))) +{ + extern int ioperm(unsigned long from, unsigned long num, int turn_on); + return ioperm(clock_ctl_addr, 2, 1); +} +# endif +#else +static int i386_iopl(const int level __attribute__ ((__unused__))) +{ + return IOPL_NOT_IMPLEMENTED; +} +#endif + +static int get_permissions_cmos(void) +{ + int rc; + + rc = i386_iopl(3); + if (rc == IOPL_NOT_IMPLEMENTED) { + warnx(_("ISA port access is not implemented")); + } else if (rc != 0) { + warn(_("iopl() port access failed")); + } + return rc; +} + +static const char *get_device_path(void) +{ + return NULL; +} + +static struct clock_ops cmos_interface = { + N_("Using direct ISA access to the clock"), + get_permissions_cmos, + read_hardware_clock_cmos, + set_hardware_clock_cmos, + synchronize_to_clock_tick_cmos, + get_device_path, +}; + +/* + * return &cmos if cmos clock present, NULL otherwise. + */ +struct clock_ops *probe_for_cmos_clock(void) +{ +#if defined(__i386__) || defined(__x86_64__) + return &cmos_interface; +#else + return NULL; +#endif +} diff --git a/sys-utils/hwclock-rtc.c b/sys-utils/hwclock-rtc.c new file mode 100644 index 0000000..32feb35 --- /dev/null +++ b/sys-utils/hwclock-rtc.c @@ -0,0 +1,448 @@ +/* + * rtc.c - Use /dev/rtc for clock access + */ +#include <asm/ioctl.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <sys/select.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> + +#include "nls.h" + +#include "hwclock.h" + +/* + * Get defines for rtc stuff. + * + * Getting the rtc defines is nontrivial. The obvious way is by including + * <linux/mc146818rtc.h> but that again includes <asm/io.h> which again + * includes ... and on sparc and alpha this gives compilation errors for + * many kernel versions. So, we give the defines ourselves here. Moreover, + * some Sparc person decided to be incompatible, and used a struct rtc_time + * different from that used in mc146818rtc.h. + */ + +/* + * On Sparcs, there is a <asm/rtc.h> that defines different ioctls (that are + * required on my machine). However, this include file does not exist on + * other architectures. + */ +/* One might do: +#ifdef __sparc__ +# include <asm/rtc.h> +#endif + */ +#ifdef __sparc__ +/* The following is roughly equivalent */ +struct sparc_rtc_time +{ + int sec; /* Seconds 0-59 */ + int min; /* Minutes 0-59 */ + int hour; /* Hour 0-23 */ + int dow; /* Day of the week 1-7 */ + int dom; /* Day of the month 1-31 */ + int month; /* Month of year 1-12 */ + int year; /* Year 0-99 */ +}; +#define RTCGET _IOR('p', 20, struct sparc_rtc_time) +#define RTCSET _IOW('p', 21, struct sparc_rtc_time) +#endif + +/* + * struct rtc_time is present since 1.3.99. + * Earlier (since 1.3.89), a struct tm was used. + */ +struct linux_rtc_time { + int tm_sec; + int tm_min; + int tm_hour; + int tm_mday; + int tm_mon; + int tm_year; + int tm_wday; + int tm_yday; + int tm_isdst; +}; + +/* RTC_RD_TIME etc have this definition since 1.99.9 (pre2.0-9) */ +#ifndef RTC_RD_TIME +# define RTC_RD_TIME _IOR('p', 0x09, struct linux_rtc_time) +# define RTC_SET_TIME _IOW('p', 0x0a, struct linux_rtc_time) +# define RTC_UIE_ON _IO('p', 0x03) /* Update int. enable on */ +# define RTC_UIE_OFF _IO('p', 0x04) /* Update int. enable off */ +#endif + +/* RTC_EPOCH_READ and RTC_EPOCH_SET are present since 2.0.34 and 2.1.89 */ +#ifndef RTC_EPOCH_READ +# define RTC_EPOCH_READ _IOR('p', 0x0d, unsigned long) /* Read epoch */ +# define RTC_EPOCH_SET _IOW('p', 0x0e, unsigned long) /* Set epoch */ +#endif + +/* + * /dev/rtc is conventionally chardev 10/135 + * ia64 uses /dev/efirtc, chardev 10/136 + * devfs (obsolete) used /dev/misc/... for miscdev + * new RTC framework + udev uses dynamic major and /dev/rtc0.../dev/rtcN + * ... so we need an overridable default + */ + +/* default or user defined dev (by hwclock --rtc=<path>) */ +static const char *rtc_dev_name; +static int rtc_dev_fd = -1; + +static void close_rtc(void) +{ + if (rtc_dev_fd != -1) + close(rtc_dev_fd); + rtc_dev_fd = -1; +} + +static int open_rtc(const struct hwclock_control *ctl) +{ + static const char *fls[] = { +#ifdef __ia64__ + "/dev/efirtc", + "/dev/misc/efirtc", +#endif + "/dev/rtc0", + "/dev/rtc", + "/dev/misc/rtc" + }; + size_t i; + + if (rtc_dev_fd != -1) + return rtc_dev_fd; + + /* --rtc option has been given */ + if (ctl->rtc_dev_name) { + rtc_dev_name = ctl->rtc_dev_name; + rtc_dev_fd = open(rtc_dev_name, O_RDONLY); + } else { + for (i = 0; i < ARRAY_SIZE(fls); i++) { + if (ctl->verbose) + printf(_("Trying to open: %s\n"), fls[i]); + rtc_dev_fd = open(fls[i], O_RDONLY); + + if (rtc_dev_fd < 0 + && (errno == ENOENT || errno == ENODEV)) + continue; + rtc_dev_name = fls[i]; + break; + } + if (rtc_dev_fd < 0) + rtc_dev_name = *fls; /* default for error messages */ + } + if (rtc_dev_fd != -1) + atexit(close_rtc); + return rtc_dev_fd; +} + +static int open_rtc_or_exit(const struct hwclock_control *ctl) +{ + int rtc_fd = open_rtc(ctl); + + if (rtc_fd < 0) { + warn(_("cannot open rtc device")); + hwclock_exit(ctl, EXIT_FAILURE); + } + return rtc_fd; +} + +static int do_rtc_read_ioctl(int rtc_fd, struct tm *tm) +{ + int rc = -1; + char *ioctlname; +#ifdef __sparc__ + /* some but not all sparcs use a different ioctl and struct */ + struct sparc_rtc_time stm; +#endif + + ioctlname = "RTC_RD_TIME"; + rc = ioctl(rtc_fd, RTC_RD_TIME, tm); + +#ifdef __sparc__ + if (rc == -1) { /* sparc sbus */ + ioctlname = "RTCGET"; + rc = ioctl(rtc_fd, RTCGET, &stm); + if (rc == 0) { + tm->tm_sec = stm.sec; + tm->tm_min = stm.min; + tm->tm_hour = stm.hour; + tm->tm_mday = stm.dom; + tm->tm_mon = stm.month - 1; + tm->tm_year = stm.year - 1900; + tm->tm_wday = stm.dow - 1; + tm->tm_yday = -1; /* day in the year */ + } + } +#endif + + if (rc == -1) { + warn(_("ioctl(%s) to %s to read the time failed"), + ioctlname, rtc_dev_name); + return -1; + } + + tm->tm_isdst = -1; /* don't know whether it's dst */ + return 0; +} + +/* + * Wait for the top of a clock tick by reading /dev/rtc in a busy loop + * until we see it. This function is used for rtc drivers without ioctl + * interrupts. This is typical on an Alpha, where the Hardware Clock + * interrupts are used by the kernel for the system clock, so aren't at + * the user's disposal. + */ +static int busywait_for_rtc_clock_tick(const struct hwclock_control *ctl, + const int rtc_fd) +{ + struct tm start_time; + /* The time when we were called (and started waiting) */ + struct tm nowtime; + int rc; + struct timeval begin, now; + + if (ctl->verbose) { + printf("ioctl(%d, RTC_UIE_ON, 0): %s\n", + rtc_fd, strerror(errno)); + printf(_("Waiting in loop for time from %s to change\n"), + rtc_dev_name); + } + + if (do_rtc_read_ioctl(rtc_fd, &start_time)) + return 1; + + /* + * Wait for change. Should be within a second, but in case + * something weird happens, we have a time limit (1.5s) on this loop + * to reduce the impact of this failure. + */ + gettimeofday(&begin, NULL); + do { + rc = do_rtc_read_ioctl(rtc_fd, &nowtime); + if (rc || start_time.tm_sec != nowtime.tm_sec) + break; + gettimeofday(&now, NULL); + if (time_diff(now, begin) > 1.5) { + warnx(_("Timed out waiting for time change.")); + return 1; + } + } while (1); + + if (rc) + return 1; + return 0; +} + +/* + * Same as synchronize_to_clock_tick(), but just for /dev/rtc. + */ +static int synchronize_to_clock_tick_rtc(const struct hwclock_control *ctl) +{ + int rtc_fd; /* File descriptor of /dev/rtc */ + int ret = 1; + + rtc_fd = open_rtc(ctl); + if (rtc_fd == -1) { + warn(_("cannot open rtc device")); + return ret; + } else { + /* Turn on update interrupts (one per second) */ + int rc = ioctl(rtc_fd, RTC_UIE_ON, 0); + + if (rc != -1) { + /* + * Just reading rtc_fd fails on broken hardware: no + * update interrupt comes and a bootscript with a + * hwclock call hangs + */ + fd_set rfds; + struct timeval tv; + + /* + * Wait up to ten seconds for the next update + * interrupt + */ + FD_ZERO(&rfds); + FD_SET(rtc_fd, &rfds); + tv.tv_sec = 10; + tv.tv_usec = 0; + rc = select(rtc_fd + 1, &rfds, NULL, NULL, &tv); + if (0 < rc) + ret = 0; + else if (rc == 0) { + warnx(_("select() to %s to wait for clock tick timed out"), + rtc_dev_name); + } else + warn(_("select() to %s to wait for clock tick failed"), + rtc_dev_name); + /* Turn off update interrupts */ + rc = ioctl(rtc_fd, RTC_UIE_OFF, 0); + if (rc == -1) + warn(_("ioctl() to %s to turn off update interrupts failed"), + rtc_dev_name); + } else if (errno == ENOTTY || errno == EINVAL) { + /* rtc ioctl interrupts are unimplemented */ + ret = busywait_for_rtc_clock_tick(ctl, rtc_fd); + } else + warn(_("ioctl(%d, RTC_UIE_ON, 0) to %s failed"), + rtc_fd, rtc_dev_name); + } + return ret; +} + +static int read_hardware_clock_rtc(const struct hwclock_control *ctl, + struct tm *tm) +{ + int rtc_fd, rc; + + rtc_fd = open_rtc_or_exit(ctl); + + /* Read the RTC time/date, return answer via tm */ + rc = do_rtc_read_ioctl(rtc_fd, tm); + + return rc; +} + +/* + * Set the Hardware Clock to the broken down time <new_broken_time>. Use + * ioctls to "rtc" device /dev/rtc. + */ +static int set_hardware_clock_rtc(const struct hwclock_control *ctl, + const struct tm *new_broken_time) +{ + int rc = -1; + int rtc_fd; + char *ioctlname; + + rtc_fd = open_rtc_or_exit(ctl); + + ioctlname = "RTC_SET_TIME"; + rc = ioctl(rtc_fd, RTC_SET_TIME, new_broken_time); + +#ifdef __sparc__ + if (rc == -1) { /* sparc sbus */ + struct sparc_rtc_time stm; + + stm.sec = new_broken_time->tm_sec; + stm.min = new_broken_time->tm_min; + stm.hour = new_broken_time->tm_hour; + stm.dom = new_broken_time->tm_mday; + stm.month = new_broken_time->tm_mon + 1; + stm.year = new_broken_time->tm_year + 1900; + stm.dow = new_broken_time->tm_wday + 1; + + ioctlname = "RTCSET"; + rc = ioctl(rtc_fd, RTCSET, &stm); + } +#endif + + if (rc == -1) { + warn(_("ioctl(%s) to %s to set the time failed"), + ioctlname, rtc_dev_name); + hwclock_exit(ctl, EXIT_FAILURE); + } + + if (ctl->verbose) + printf(_("ioctl(%s) was successful.\n"), ioctlname); + + return 0; +} + +static int get_permissions_rtc(void) +{ + return 0; +} + +static const char *get_device_path(void) +{ + return rtc_dev_name; +} + +static struct clock_ops rtc_interface = { + N_("Using the rtc interface to the clock."), + get_permissions_rtc, + read_hardware_clock_rtc, + set_hardware_clock_rtc, + synchronize_to_clock_tick_rtc, + get_device_path, +}; + +/* return &rtc if /dev/rtc can be opened, NULL otherwise */ +struct clock_ops *probe_for_rtc_clock(const struct hwclock_control *ctl) +{ + const int rtc_fd = open_rtc(ctl); + + if (rtc_fd < 0) + return NULL; + return &rtc_interface; +} + +#ifdef __alpha__ +/* + * Get the Hardware Clock epoch setting from the kernel. + */ +int get_epoch_rtc(const struct hwclock_control *ctl, unsigned long *epoch_p) +{ + int rtc_fd; + + rtc_fd = open_rtc(ctl); + if (rtc_fd < 0) { + warn(_("cannot open %s"), rtc_dev_name); + return 1; + } + + if (ioctl(rtc_fd, RTC_EPOCH_READ, epoch_p) == -1) { + warn(_("ioctl(%d, RTC_EPOCH_READ, epoch_p) to %s failed"), + rtc_fd, rtc_dev_name); + return 1; + } + + if (ctl->verbose) + printf(_("ioctl(%d, RTC_EPOCH_READ, epoch_p) to %s succeeded.\n"), + rtc_fd, rtc_dev_name); + + return 0; +} + +/* + * Set the Hardware Clock epoch in the kernel. + */ +int set_epoch_rtc(const struct hwclock_control *ctl) +{ + int rtc_fd; + unsigned long epoch; + + epoch = strtoul(ctl->epoch_option, NULL, 10); + + /* There were no RTC clocks before 1900. */ + if (epoch < 1900 || epoch == ULONG_MAX) { + warnx(_("invalid epoch '%s'."), ctl->epoch_option); + return 1; + } + + rtc_fd = open_rtc(ctl); + if (rtc_fd < 0) { + warn(_("cannot open %s"), rtc_dev_name); + return 1; + } + + if (ioctl(rtc_fd, RTC_EPOCH_SET, epoch) == -1) { + warn(_("ioctl(%d, RTC_EPOCH_SET, %lu) to %s failed"), + rtc_fd, epoch, rtc_dev_name); + return 1; + } + + if (ctl->verbose) + printf(_("ioctl(%d, RTC_EPOCH_SET, %lu) to %s succeeded.\n"), + rtc_fd, epoch, rtc_dev_name); + + return 0; +} +#endif /* __alpha__ */ diff --git a/sys-utils/hwclock.8 b/sys-utils/hwclock.8 new file mode 100644 index 0000000..8a10e7a --- /dev/null +++ b/sys-utils/hwclock.8 @@ -0,0 +1,998 @@ +.\" hwclock.8.in -- man page for util-linux' hwclock +.\" +.\" 2015-01-07 J William Piggott +.\" Authored new section: DATE-TIME CONFIGURATION. +.\" Subsections: Keeping Time..., LOCAL vs UTC, POSIX vs 'RIGHT'. +.\" +.TH HWCLOCK 8 "July 2017" "util-linux" "System Administration" +.SH NAME +hwclock \- time clocks utility +.SH SYNOPSIS +.B hwclock +.RI [ function ] +.RI [ option ...] +. +.SH DESCRIPTION +.B hwclock +is an administration tool for the time clocks. It can: display the +Hardware Clock time; set the Hardware Clock to a specified time; set the +Hardware Clock from the System Clock; set the System Clock from the +Hardware Clock; compensate for Hardware Clock drift; correct the System +Clock timescale; set the kernel's timezone, NTP timescale, and epoch +(Alpha only); and predict future +Hardware Clock values based on its drift rate. +.PP +Since v2.26 important changes were made to the +.B \-\-hctosys +function and the +.B \-\-directisa +option, and a new option +.B \-\-update\-drift +was added. See their respective descriptions below. +. +.SH FUNCTIONS +The following functions are mutually exclusive, only one can be given at +a time. If none is given, the default is \fB\-\-show\fR. +.TP +.B \-a, \-\-adjust +Add or subtract time from the Hardware Clock to account for systematic +drift since the last time the clock was set or adjusted. See the +discussion below, under +.BR "The Adjust Function" . +. +.TP +.B \-\-getepoch +.TQ +.B \-\-setepoch +These functions are for Alpha machines only, and are only available +through the Linux kernel RTC driver. +.sp +They are used to read and set the kernel's Hardware Clock epoch value. +Epoch is the number of years into AD to which a zero year value in the +Hardware Clock refers. For example, if the machine's BIOS sets the year +counter in the Hardware Clock to contain the number of full years since +1952, then the kernel's Hardware Clock epoch value must be 1952. +.sp +The \fB\%\-\-setepoch\fR function requires using the +.B \%\-\-epoch +option to specify the year. For example: +.RS +.IP "" 4 +.B hwclock\ \-\-setepoch\ \-\-epoch=1952 +.PP +The RTC driver attempts to guess the correct epoch value, so setting it +may not be required. +.PP +This epoch value is used whenever +.B \%hwclock +reads or sets the Hardware Clock on an Alpha machine. For ISA machines +the kernel uses the fixed Hardware Clock epoch of 1900. +.RE +. +.TP +.B \-\-predict +Predict what the Hardware Clock will read in the future based upon the +time given by the +.B \-\-date +option and the information in +.IR /etc/adjtime . +This is useful, for example, to account for drift when setting a +Hardware Clock wakeup (aka alarm). See +.BR \%rtcwake (8). +.sp +Do not use this function if the Hardware Clock is being modified by +anything other than the current operating system's +.B \%hwclock +command, such as \%'11\ minute\ mode' or from dual-booting another OS. +. +.TP +.BR \-r , \ \-\-show +.TQ +.B \-\-get +.br +Read the Hardware Clock and print its time to standard output in the +.B ISO 8601 +format. +The time shown is always in local time, even if you keep your Hardware Clock +in UTC. See the +.B \%\-\-localtime +option. +.sp +Showing the Hardware Clock time is the default when no function is specified. +.sp +The +.B \-\-get +function also applies drift correction to the time read, based upon the +information in +.IR /etc/adjtime . +Do not use this function if the Hardware Clock is being modified by +anything other than the current operating system's +.B \%hwclock +command, such as \%'11\ minute\ mode' or from dual-booting another OS. +. +.TP +.BR \-s , \ \-\-hctosys +Set the System Clock from the Hardware Clock. The time read from the Hardware +Clock is compensated to account for systematic drift before using it to set the +System Clock. See the discussion below, under +.BR "The Adjust Function" . +.sp +The System Clock must be kept in the UTC timescale for date-time +applications to work correctly in conjunction with the timezone configured +for the system. If the Hardware Clock is kept in local time then the time read +from it must be shifted to the UTC timescale before using it to set the System +Clock. The +.B \%\-\-hctosys +function does this based upon the information in the +.I /etc/adjtime +file or the command line arguments +.BR \%\-\-localtime " and " \-\-utc . +Note: no daylight saving adjustment is made. See the discussion below, under +.BR "LOCAL vs UTC" . +.sp +The kernel also keeps a timezone value, the +.B \%\-\-hctosys +function sets it to the timezone configured for the system. The system +timezone is configured by the TZ environment variable or the +.I \%/etc/localtime +file, as +.BR \%tzset (3) +would interpret them. +The obsolete tz_dsttime field of the kernel's timezone value is set +to zero. (For details on what this field used to mean, see +.BR \%settimeofday (2).) +.sp +When used in a startup script, making the +.B \%\-\-hctosys +function the first caller of +.BR \%settimeofday (2) +from boot, it will set the NTP \%'11\ minute\ mode' timescale via the +.I \%persistent_clock_is_local +kernel variable. If the Hardware Clock's timescale configuration is +changed then a reboot is required to inform the kernel. See the +discussion below, under +.BR "Automatic Hardware Clock Synchronization by the Kernel" . +.sp +This is a good function to use in one of the system startup scripts before the +file systems are mounted read/write. +.sp +This function should never be used on a running system. Jumping system time +will cause problems, such as corrupted filesystem timestamps. Also, if +something has changed the Hardware Clock, like NTP's \%'11\ minute\ mode', then +.B \%\-\-hctosys +will set the time incorrectly by including drift compensation. +.sp +Drift compensation can be inhibited by setting the drift factor in +.I /etc/adjtime +to zero. This setting will be persistent as long as the +.BR \%\-\-update\-drift " option is not used with " \%\-\-systohc +at shutdown (or anywhere else). Another way to inhibit this is by using the +.BR \%\-\-noadjfile " option when calling the " \%\-\-hctosys +function. A third method is to delete the +.IR /etc/adjtime " file." +.B Hwclock +will then default to using the UTC timescale for the Hardware Clock. If +the Hardware Clock is ticking local time it will need to be defined in +the file. This can be done by calling +.BR hwclock\ \-\-localtime\ \-\-adjust ; +when the file is not present this command will not actually +adjust the Clock, but it will create the file with local time +configured, and a drift factor of zero. +.sp +A condition under which inhibiting +.BR hwclock 's +drift correction may be desired is when dual-booting multiple operating +systems. If while this instance of Linux is stopped, another OS changes +the Hardware Clock's value, then when this instance is started again the +drift correction applied will be incorrect. +.sp +.RB "For " hwclock 's +drift correction to work properly it is imperative that nothing changes +the Hardware Clock while its Linux instance is not running. +. +.TP +.B \-\-set +Set the Hardware Clock to the time given by the +.B \-\-date +option, and update the timestamps in +.IR /etc/adjtime . +With the +.B \%\-\-update-drift +option also (re)calculate the drift factor. Try it without the option if +.BR \%\-\-set " fails. See " \%\-\-update-drift " below." +. +.TP +.B \-\-systz +This is an alternate to the +.B \%\-\-hctosys +function that does not read the Hardware Clock nor set the System Clock; +consequently there is not any drift correction. It is intended to be +used in a startup script on systems with kernels above version 2.6 where +you know the System Clock has been set from the Hardware Clock by the +kernel during boot. +.sp +It does the following things that are detailed above in the +.BR \%\-\-hctosys " function:" +.RS +.IP \(bu 2 +Corrects the System Clock timescale to UTC as needed. Only instead of +accomplishing this by setting the System Clock, +.B hwclock +simply informs the kernel and it handles the change. +.IP \(bu 2 +Sets the kernel's NTP \%'11\ minute\ mode' timescale. +.IP \(bu 2 +Sets the kernel's timezone. +.PP +The first two are only available on the first call of +.BR \%settimeofday (2) +after boot. Consequently this option only makes sense when used in a +startup script. If the Hardware Clocks timescale configuration is +changed then a reboot would be required to inform the kernel. +.RE +. +.TP +.BR \-w , \ \-\-systohc +Set the Hardware Clock from the System Clock, and update the timestamps in +.IR /etc/adjtime . +With the +.B \%\-\-update-drift +option also (re)calculate the drift factor. Try it without the option if +.BR \%\-\-systohc " fails. See " \%\-\-update-drift " below." +. +.TP +.BR \-V , \ \-\-version +Display version information and exit. +. +.TP +.BR \-h , \ \-\-help +Display help text and exit. +. +.SH OPTIONS +. +.TP +.BI \-\-adjfile= filename +.RI "Override the default " /etc/adjtime " file path." +. +.TP +.BI \%\-\-date= date_string +This option must be used with the +.B \-\-set +or +.B \%\-\-predict +functions, otherwise it is ignored. +.RS +.IP "" 4 +.B "hwclock\ \-\-set\ \-\-date='16:45'" +.IP "" 4 +.B "hwclock\ \-\-predict\ \-\-date='2525-08-14\ 07:11:05'" +.PP +The argument must be in local time, even if you keep your Hardware Clock in +UTC. See the +.B \%\-\-localtime +option. Therefore, the argument should not include any timezone information. +It also should not be a relative time like "+5 minutes", because +.BR \%hwclock 's +precision depends upon correlation between the argument's value and when the +enter key is pressed. Fractional seconds are silently dropped. This option is +capable of understanding many time and date formats, but the previous +parameters should be observed. +.RE +. +.TP +.BI \%\-\-delay= seconds +This option allows to overwrite internally used delay when set clock time. The +default is 0.5 (500ms) for rtc_cmos, for another RTC types the delay is 0. If +RTC type is impossible to determine (from sysfs) then it defaults also to 0.5 +to be backwardly compatible. +.RS +.PP +The 500ms default is based on commonly used MC146818A-compatible (x86) hardware clock. This +Hardware Clock can only be set to any integer time plus one half second. The +integer time is required because there is no interface to set or get a +fractional second. The additional half second delay is because the Hardware +Clock updates to the following second precisely 500 ms after setting the new +time. Unfortunately, this behavior is hardware specific and in same cases +another delay is required. +.RE +. +.TP +.TP +.BR \-D ", " \-\-debug +.RB Use\ \-\-verbose . +.RB The\ \%\-\-debug\ option +has been deprecated and may be repurposed or removed in a future release. +. +.TP +.B \-\-directisa +This option is meaningful for ISA compatible machines in the x86 and +x86_64 family. For other machines, it has no effect. This option tells +.B \%hwclock +to use explicit I/O instructions to access the Hardware Clock. +Without this option, +.B \%hwclock +will use the rtc device file, which it assumes to be driven by the Linux +RTC device driver. As of v2.26 it will no longer automatically use +directisa when the rtc driver is unavailable; this was causing an unsafe +condition that could allow two processes to access the Hardware Clock at +the same time. Direct hardware access from userspace should only be +used for testing, troubleshooting, and as a last resort when all other +methods fail. See the +.BR \-\-rtc " option." +. +.TP +.BI \-\-epoch= year +This option is required when using the +.BR \%\-\-setepoch \ function. +.RI "The minimum " year +value is 1900. The maximum is system dependent +.RB ( ULONG_MAX\ -\ 1 ). +. +.TP +.BR \-f , \ \-\-rtc=\fIfilename\fR +.RB "Override " \%hwclock 's +default rtc device file name. Otherwise it will +use the first one found in this order: +.in +4 +.br +.I /dev/rtc0 +.br +.I /dev/rtc +.br +.I /dev/misc/rtc +.br +.in +.RB "For " IA-64: +.in +4 +.br +.I /dev/efirtc +.br +.I /dev/misc/efirtc +.in +. +.TP +.BR \-l , \ \-\-localtime +.TQ +.BR \-u ", " \-\-utc +Indicate which timescale the Hardware Clock is set to. +.sp +The Hardware Clock may be configured to use either the UTC or the local +timescale, but nothing in the clock itself says which alternative is +being used. The +.BR \%\-\-localtime " or " \-\-utc +options give this information to the +.B \%hwclock +command. If you specify the wrong one (or specify neither and take a +wrong default), both setting and reading the Hardware Clock will be +incorrect. +.sp +If you specify neither +.BR \-\-utc " nor " \%\-\-localtime +then the one last given with a set function +.RB ( \-\-set ", " \%\-\-systohc ", or " \%\-\-adjust ), +as recorded in +.IR /etc/adjtime , +will be used. If the adjtime file doesn't exist, the default is UTC. +.sp +Note: daylight saving time changes may be inconsistent when the +Hardware Clock is kept in local time. See the discussion below, under +.BR "LOCAL vs UTC" . +. +.TP +.B \-\-noadjfile +Disable the facilities provided by +.IR /etc/adjtime . +.B \%hwclock +will not read nor write to that file with this option. Either +.BR \-\-utc " or " \%\-\-localtime +must be specified when using this option. +. +.TP +.B \-\-test +Do not actually change anything on the system, that is, the Clocks or +.I /etc/adjtime +.RB ( \%\-\-verbose +is implicit with this option). +. +.TP +.B \-\-update\-drift +Update the Hardware Clock's drift factor in +.IR /etc/adjtime . +It can only be used with +.BR \-\-set " or " \%\-\-systohc , +.sp +A minimum four hour period between settings is required. This is to +avoid invalid calculations. The longer the period, the more precise the +resulting drift factor will be. +.sp +This option was added in v2.26, because +it is typical for systems to call +.B \%hwclock\ \-\-systohc +at shutdown; with the old behaviour this would automatically +(re)calculate the drift factor which caused several problems: +.RS +.IP \(bu 2 +When using NTP with an \%'11\ minute\ mode' kernel the drift factor +would be clobbered to near zero. +.IP \(bu 2 +It would not allow the use of 'cold' drift correction. With most +configurations using 'cold' drift will yield favorable results. Cold, +means when the machine is turned off which can have a significant impact +on the drift factor. +.IP \(bu 2 +(Re)calculating drift factor on every shutdown delivers suboptimal +results. For example, if ephemeral conditions cause the machine to be +abnormally hot the drift factor calculation would be out of range. +.IP \(bu 2 +Significantly increased system shutdown times (as of v2.31 when not +using +.B \%\-\-update\-drift +the RTC is not read). +.PP +.RB "Having " \%hwclock +calculate the drift factor is a good starting point, but for optimal +results it will likely need to be adjusted by directly editing the +.I /etc/adjtime +file. For most configurations once a machine's optimal drift factor is +crafted it should not need to be changed. Therefore, the old behavior to +automatically (re)calculate drift was changed and now requires this +option to be used. See the discussion below, under +.BR "The Adjust Function" . +.PP +This option requires reading the Hardware Clock before setting it. If +it cannot be read, then this option will cause the set functions to fail. +This can happen, for example, if the Hardware Clock is corrupted by a +power failure. In that case, the clock must first be set without this +option. Despite it not working, the resulting drift correction factor +would be invalid anyway. +.RE +. +.TP +.BR \-v ", " \-\-verbose +Display more details about what +.B \%hwclock +is doing internally. +. +.SH NOTES +. +.SS Clocks in a Linux System +.PP +There are two types of date-time clocks: +.PP +.B The Hardware Clock: +This clock is an independent hardware device, with its own power domain +(battery, capacitor, etc), that operates when the machine is powered off, +or even unplugged. +.PP +On an ISA compatible system, this clock is specified as part of the ISA +standard. A control program can read or set this clock only to a whole +second, but it can also detect the edges of the 1 second clock ticks, so +the clock actually has virtually infinite precision. +.PP +This clock is commonly called the hardware clock, the real time clock, +the RTC, the BIOS clock, and the CMOS clock. Hardware Clock, in its +capitalized form, was coined for use by +.BR \%hwclock . +The Linux kernel also refers to it as the persistent clock. +.PP +Some non-ISA systems have a few real time clocks with +only one of them having its own power domain. +A very low power external I2C or SPI clock chip might be used with a +backup battery as the hardware clock to initialize a more functional +integrated real-time clock which is used for most other purposes. +.PP +.B The System Clock: +This clock is part of the Linux kernel and is driven by +a timer interrupt. (On an ISA machine, the timer interrupt is part of +the ISA standard.) It has meaning only while Linux is running on the +machine. The System Time is the number of seconds since 00:00:00 +January 1, 1970 UTC (or more succinctly, the number of seconds since +1969 UTC). The System Time is not an integer, though. It has virtually +infinite precision. +.PP +The System Time is the time that matters. The Hardware Clock's basic +purpose is to keep time when Linux is not running so that the System +Clock can be initialized from it at boot. Note that in DOS, for which +ISA was designed, the Hardware Clock is the only real time clock. +.PP +It is important that the System Time not have any discontinuities such as +would happen if you used the +.BR \%date (1) +program to set it while the system is running. You can, however, do whatever +you want to the Hardware Clock while the system is running, and the next +time Linux starts up, it will do so with the adjusted time from the Hardware +Clock. Note: currently this is not possible on most systems because +.B \%hwclock\ \-\-systohc +is called at shutdown. +.PP +The Linux kernel's timezone is set by +.BR hwclock . +But don't be misled -- almost nobody cares what timezone the kernel +thinks it is in. Instead, programs that care about the timezone +(perhaps because they want to display a local time for you) almost +always use a more traditional method of determining the timezone: They +use the TZ environment variable or the +.I \%/etc/localtime +file, as explained in the man page for +.BR \%tzset (3). +However, some programs and fringe parts of the Linux kernel such as filesystems +use the kernel's timezone value. An example is the vfat filesystem. If the +kernel timezone value is wrong, the vfat filesystem will report and set the +wrong timestamps on files. Another example is the kernel's NTP \%'11\ minute\ mode'. +If the kernel's timezone value and/or the +.I \%persistent_clock_is_local +variable are wrong, then the Hardware Clock will be set incorrectly +by \%'11\ minute\ mode'. See the discussion below, under +.BR "Automatic Hardware Clock Synchronization by the Kernel" . +.PP +.B \%hwclock +sets the kernel's timezone to the value indicated by TZ or +.IR \%/etc/localtime " with the" +.BR \%\-\-hctosys " or " \%\-\-systz " functions." +.PP +The kernel's timezone value actually consists of two parts: 1) a field +tz_minuteswest indicating how many minutes local time (not adjusted +for DST) lags behind UTC, and 2) a field tz_dsttime indicating +the type of Daylight Savings Time (DST) convention that is in effect +in the locality at the present time. +This second field is not used under Linux and is always zero. +See also +.BR \%settimeofday (2). +. +.SS Hardware Clock Access Methods +.PP +.B \%hwclock +uses many different ways to get and set Hardware Clock values. The most +normal way is to do I/O to the rtc device special file, which is +presumed to be driven by the rtc device driver. Also, Linux systems +using the rtc framework with udev, are capable of supporting multiple +Hardware Clocks. This may bring about the need to override the default +rtc device by specifying one with the +.BR \-\-rtc " option." +.PP +However, this method is not always available as older systems do not +have an rtc driver. On these systems, the method of accessing the +Hardware Clock depends on the system hardware. +.PP +On an ISA compatible system, +.B \%hwclock +can directly access the "CMOS memory" registers that +constitute the clock, by doing I/O to Ports 0x70 and 0x71. It does +this with actual I/O instructions and consequently can only do it if +running with superuser effective userid. This method may be used by +specifying the +.BR \%\-\-directisa " option." +.PP +This is a really poor method of accessing the clock, for all the +reasons that userspace programs are generally not supposed to do +direct I/O and disable interrupts. +.B \%hwclock +provides it for testing, troubleshooting, and because it may be the +only method available on ISA systems which do not have a working rtc +device driver. +.SS The Adjust Function +.PP +The Hardware Clock is usually not very accurate. However, much of its +inaccuracy is completely predictable - it gains or loses the same amount +of time every day. This is called systematic drift. +.BR \%hwclock "'s " \%\-\-adjust +function lets you apply systematic drift corrections to the +Hardware Clock. +.PP +It works like this: +.BR \%hwclock " keeps a file," +.IR /etc/adjtime , +that keeps some historical information. This is called the adjtime file. +.PP +Suppose you start with no adjtime file. You issue a +.B \%hwclock\ \-\-set +command to set the Hardware Clock to the true current time. +.B \%hwclock +creates the adjtime file and records in it the current time as the +last time the clock was calibrated. +Five days later, the clock has gained 10 seconds, so you issue a +.B \%hwclock\ \-\-set\ \-\-update\-drift +command to set it back 10 seconds. +.B \%hwclock +updates the adjtime file to show the current time as the last time the +clock was calibrated, and records 2 seconds per day as the systematic +drift rate. 24 hours go by, and then you issue a +.B \%hwclock\ \-\-adjust +command. +.B \%hwclock +consults the adjtime file and sees that the clock gains 2 seconds per +day when left alone and that it has been left alone for exactly one +day. So it subtracts 2 seconds from the Hardware Clock. It then +records the current time as the last time the clock was adjusted. +Another 24 hours go by and you issue another +.BR \%hwclock\ \-\-adjust . +.B \%hwclock +does the same thing: subtracts 2 seconds and updates the adjtime file +with the current time as the last time the clock was adjusted. +.PP +When you use the +.BR \%\-\-update\-drift " option with " \-\-set " or " \%\-\-systohc , +the systematic drift rate is (re)calculated by comparing the fully drift +corrected current Hardware Clock time with the new set time, from that +it derives the 24 hour drift rate based on the last calibrated timestamp +from the adjtime file. This updated drift factor is then saved in +.IR /etc/adjtime . +.PP +A small amount of error creeps in when +the Hardware Clock is set, so +.B \%\-\-adjust +refrains from making any adjustment that is less +than 1 second. Later on, when you request an adjustment again, the accumulated +drift will be more than 1 second and +.B \%\-\-adjust +will make the adjustment including any fractional amount. +.PP +.B \%hwclock\ \-\-hctosys +also uses the adjtime file data to compensate the value read from the Hardware +Clock before using it to set the System Clock. It does not share the 1 second +limitation of +.BR \%\-\-adjust , +and will correct sub-second drift values immediately. It does not +change the Hardware Clock time nor the adjtime file. This may eliminate +the need to use +.BR \%\-\-adjust , +unless something else on the system needs the Hardware Clock to be +compensated. +. +.SS The Adjtime File +While named for its historical purpose of controlling adjustments only, +it actually contains other information used by +.B hwclock +from one invocation to the next. +.PP +The format of the adjtime file is, in ASCII: +.PP +Line 1: Three numbers, separated by blanks: 1) the systematic drift rate +in seconds per day, floating point decimal; 2) the resulting number of +seconds since 1969 UTC of most recent adjustment or calibration, +decimal integer; 3) zero (for compatibility with +.BR \%clock (8)) +as a decimal integer. +.PP +Line 2: One number: the resulting number of seconds since 1969 UTC of most +recent calibration. Zero if there has been no calibration yet or it +is known that any previous calibration is moot (for example, because +the Hardware Clock has been found, since that calibration, not to +contain a valid time). This is a decimal integer. +.PP +Line 3: "UTC" or "LOCAL". Tells whether the Hardware Clock is set to +Coordinated Universal Time or local time. You can always override this +value with options on the +.B \%hwclock +command line. +.PP +You can use an adjtime file that was previously used with the +.BR \%clock "(8) program with " \%hwclock . +. +.SS Automatic Hardware Clock Synchronization by the Kernel +.PP +You should be aware of another way that the Hardware Clock is kept +synchronized in some systems. The Linux kernel has a mode wherein it +copies the System Time to the Hardware Clock every 11 minutes. This mode +is a compile time option, so not all kernels will have this capability. +This is a good mode to use when you are using something sophisticated +like NTP to keep your System Clock synchronized. (NTP is a way to keep +your System Time synchronized either to a time server somewhere on the +network or to a radio clock hooked up to your system. See RFC 1305.) +.PP +If the kernel is compiled with the \%'11\ minute\ mode' option it will +be active when the kernel's clock discipline is in a synchronized state. +When in this state, bit 6 (the bit that is set in the mask 0x0040) +of the kernel's +.I \%time_status +variable is unset. This value is output as the 'status' line of the +.BR \%adjtimex\ --print " or " \%ntptime " commands." +.PP +It takes an outside influence, like the NTP daemon +to put the kernel's clock discipline into a synchronized state, and +therefore turn on \%'11\ minute\ mode'. +It can be turned off by running anything that sets the System Clock the old +fashioned way, including +.BR "\%hwclock\ \-\-hctosys" . +However, if the NTP daemon is still running, it will turn \%'11\ minute\ mode' +back on again the next time it synchronizes the System Clock. +.PP +If your system runs with \%'11\ minute\ mode' on, it may need to use either +.BR \%\-\-hctosys " or " \%\-\-systz +in a startup script, especially if the Hardware Clock is configured to use +the local timescale. Unless the kernel is informed of what timescale the +Hardware Clock is using, it may clobber it with the wrong one. The kernel +uses UTC by default. +.PP +The first userspace command to set the System Clock informs the +kernel what timescale the Hardware Clock is using. This happens via the +.I \%persistent_clock_is_local +kernel variable. If +.BR \%\-\-hctosys " or " \%\-\-systz +is the first, it will set this variable according to the adjtime file or the +appropriate command-line argument. Note that when using this capability and the +Hardware Clock timescale configuration is changed, then a reboot is required to +notify the kernel. +.PP +.B \%hwclock\ \-\-adjust +should not be used with NTP \%'11\ minute\ mode'. +. +.SS ISA Hardware Clock Century value +.PP +There is some sort of standard that defines CMOS memory Byte 50 on an ISA +machine as an indicator of what century it is. +.B \%hwclock +does not use or set that byte because there are some machines that +don't define the byte that way, and it really isn't necessary anyway, +since the year-of-century does a good job of implying which century it +is. +.PP +If you have a bona fide use for a CMOS century byte, contact the +.B \%hwclock +maintainer; an option may be appropriate. +.PP +Note that this section is only relevant when you are using the "direct +ISA" method of accessing the Hardware Clock. +ACPI provides a standard way to access century values, when they +are supported by the hardware. +. +.SH DATE-TIME CONFIGURATION +.in +4 +.SS Keeping Time without External Synchronization +.in +.PP +This discussion is based on the following conditions: +.IP \(bu 2 +Nothing is running that alters the date-time clocks, such as NTP daemon or a cron job." +.IP \(bu 2 +The system timezone is configured for the correct local time. See below, under +.BR "POSIX vs 'RIGHT'" . +.IP \(bu 2 +Early during startup the following are called, in this order: +.br +.BI \%adjtimex\ \-\-tick \ value\ \-\-frequency \ value +.br +.B \%hwclock\ \-\-hctosys +.IP \(bu 2 +During shutdown the following is called: +.br +.B \%hwclock\ \-\-systohc +.PP +.in +4 +.BR * " Systems without " adjtimex " may use " ntptime . +.in +.PP +Whether maintaining precision time with NTP daemon +or not, it makes sense to configure the system to keep reasonably good +date-time on its own. +.PP +The first step in making that happen is having a clear understanding of +the big picture. There are two completely separate hardware devices +running at their own speed and drifting away from the 'correct' time at +their own rates. The methods and software for drift correction are +different for each of them. However, most systems are configured to +exchange values between these two clocks at startup and shutdown. Now +the individual device's time keeping errors are transferred back and +forth between each other. Attempt to configure drift correction for only +one of them, and the other's drift will be overlaid upon it. +.PP +This problem can be avoided when configuring drift correction for the +System Clock by simply not shutting down the machine. This, plus the +fact that all of +.BR \%hwclock 's +precision (including calculating drift factors) depends upon the System +Clock's rate being correct, means that configuration of the System Clock +should be done first. +.PP +The System Clock drift is corrected with the +.BR \%adjtimex "(8) command's " \-\-tick " and " \%\-\-frequency +options. These two work together: tick is the coarse adjustment and +frequency is the fine adjustment. (For systems that do not have an +.BR \%adjtimex " package," +.BI \%ntptime\ \-f\ ppm +may be used instead.) +.PP +Some Linux distributions attempt to automatically calculate the System +Clock drift with +.BR \%adjtimex 's +compare operation. Trying to correct one +drifting clock by using another drifting clock as a reference is akin to +a dog trying to catch its own tail. Success may happen eventually, but +great effort and frustration will likely precede it. This automation may +yield an improvement over no configuration, but expecting optimum +results would be in error. A better choice for manual configuration +would be +.BR \%adjtimex 's " \-\-log " options. +.PP +It may be more effective to simply track the System Clock drift with +.BR \%sntp ", or " \%date\ \-Ins +and a precision timepiece, and then calculate the correction manually. +.PP +After setting the tick and frequency values, continue to test and refine the +adjustments until the System Clock keeps good time. See +.BR \%adjtimex (8) +for more information and the example demonstrating manual drift +calculations. +.PP +Once the System Clock is ticking smoothly, move on to the Hardware Clock. +.PP +As a rule, cold drift will work best for most use cases. This should be +true even for 24/7 machines whose normal downtime consists of a reboot. +In that case the drift factor value makes little difference. But on the +rare occasion that the machine is shut down for an extended period, then +cold drift should yield better results. +.PP +.B Steps to calculate cold drift: +.IP 1 2 +.B "Ensure that NTP daemon will not be launched at startup." +.IP 2 2 +.RI The " System Clock " "time must be correct at shutdown!" +.IP 3 2 +Shut down the system. +.IP 4 2 +Let an extended period pass without changing the Hardware Clock. +.IP 5 2 +Start the system. +.IP 6 2 +.RB "Immediately use " hwclock " to set the correct time, adding the" +.BR \%\-\-update\-drift " option." +.PP +Note: if step 6 uses +.BR \%\-\-systohc , +then the System Clock must be set correctly (step 6a) just before doing so. +.PP +.RB "Having " hwclock +calculate the drift factor is a good starting point, but for optimal +results it will likely need to be adjusted by directly editing the +.I /etc/adjtime +file. Continue to test and refine the drift factor until the Hardware +Clock is corrected properly at startup. To check this, first make sure +that the System Time is correct before shutdown and then use +.BR \%sntp ", or " \%date\ \-Ins +and a precision timepiece, immediately after startup. +.SS LOCAL vs UTC +Keeping the Hardware Clock in a local timescale causes inconsistent +daylight saving time results: +.IP \(bu 2 +If Linux is running during a daylight saving time change, the time +written to the Hardware Clock will be adjusted for the change. +.IP \(bu 2 +If Linux is NOT running during a daylight saving time change, the time +read from the Hardware Clock will NOT be adjusted for the change. +.PP +The Hardware Clock on an ISA compatible system keeps only a date and time, +it has no concept of timezone nor daylight saving. Therefore, when +.B hwclock +is told that it is in local time, it assumes it is in the 'correct' +local time and makes no adjustments to the time read from it. +.PP +Linux handles daylight saving time changes transparently only when the +Hardware Clock is kept in the UTC timescale. Doing so is made easy for +system administrators as +.B \%hwclock +uses local time for its output and as the argument to the +.BR \%\-\-date " option." +.PP +POSIX systems, like Linux, are designed to have the System Clock operate +in the UTC timescale. The Hardware Clock's purpose is to initialize the +System Clock, so also keeping it in UTC makes sense. +.PP +Linux does, however, attempt to accommodate the Hardware Clock being in +the local timescale. This is primarily for dual-booting with older +versions of MS Windows. From Windows 7 on, the RealTimeIsUniversal +registry key is supposed to be working properly so that its Hardware +Clock can be kept in UTC. +. +.SS POSIX vs 'RIGHT' +A discussion on date-time configuration would be incomplete without +addressing timezones, this is mostly well covered by +.BR tzset (3). +One area that seems to have no documentation is the 'right' +directory of the Time Zone Database, sometimes called tz or zoneinfo. +.PP +There are two separate databases in the zoneinfo system, posix +and 'right'. 'Right' (now named zoneinfo\-leaps) includes leap seconds and posix +does not. To use the 'right' database the System Clock must be set to +\%(UTC\ +\ leap seconds), which is equivalent to \%(TAI\ \-\ 10). This +allows calculating the +exact number of seconds between two dates that cross a leap second +epoch. The System Clock is then converted to the correct civil time, +including UTC, by using the 'right' timezone files which subtract the +leap seconds. Note: this configuration is considered experimental and is +known to have issues. +.PP +To configure a system to use a particular database all of the files +located in its directory must be copied to the root of +.IR \%/usr/share/zoneinfo . +Files are never used directly from the posix or 'right' subdirectories, e.g., +.RI \%TZ=' right/Europe/Dublin '. +This habit was becoming so common that the upstream zoneinfo project +restructured the system's file tree by moving the posix and 'right' +subdirectories out of the zoneinfo directory and into sibling directories: +.PP +.in +2 +.I /usr/share/zoneinfo +.br +.I /usr/share/zoneinfo\-posix +.br +.I /usr/share/zoneinfo\-leaps +.PP +Unfortunately, some Linux distributions are changing it back to the old +tree structure in their packages. So the problem of system +administrators reaching into the 'right' subdirectory persists. This +causes the system timezone to be configured to include leap seconds +while the zoneinfo database is still configured to exclude them. Then +when an application such as a World Clock needs the South_Pole timezone +file; or an email MTA, or +.B hwclock +needs the UTC timezone file; they fetch it from the root of +.I \%/usr/share/zoneinfo +, because that is what they are supposed to do. Those files exclude leap +seconds, but the System Clock now includes them, causing an incorrect +time conversion. +.PP +Attempting to mix and match files from these separate databases will not +work, because they each require the System Clock to use a different +timescale. The zoneinfo database must be configured to use either posix +or 'right', as described above, or by assigning a database path to the +.SB TZDIR +environment variable. +.SH EXIT STATUS +One of the following exit values will be returned: +.TP +.BR EXIT_SUCCESS " ('0' on POSIX systems)" +Successful program execution. +.TP +.BR EXIT_FAILURE " ('1' on POSIX systems)" +The operation failed or the command syntax was not valid. +.SH ENVIRONMENT +.TP +.B TZ +If this variable is set its value takes precedence over the system +configured timezone. +.TP +.B TZDIR +If this variable is set its value takes precedence over the system +configured timezone database directory path. +.SH FILES +.TP +.I /etc/adjtime +The configuration and state file for hwclock. +.TP +.I /etc/localtime +The system timezone file. +.TP +.I /usr/share/zoneinfo/ +The system timezone database directory. +.PP +Device files +.B hwclock +may try for Hardware Clock access: +.br +.I /dev/rtc0 +.br +.I /dev/rtc +.br +.I /dev/misc/rtc +.br +.I /dev/efirtc +.br +.I /dev/misc/efirtc +.SH "SEE ALSO" +.BR date (1), +.BR adjtimex (8), +.BR gettimeofday (2), +.BR settimeofday (2), +.BR crontab (1), +.BR tzset (3) +. +.SH AUTHORS +Written by Bryan Henderson, September 1996 (bryanh@giraffe-data.com), +based on work done on the +.BR \%clock (8) +program by Charles Hedrick, Rob Hooft, and Harald Koenig. +See the source code for complete history and credits. +. +.SH AVAILABILITY +The hwclock command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/hwclock.8.in b/sys-utils/hwclock.8.in new file mode 100644 index 0000000..dacdd27 --- /dev/null +++ b/sys-utils/hwclock.8.in @@ -0,0 +1,998 @@ +.\" hwclock.8.in -- man page for util-linux' hwclock +.\" +.\" 2015-01-07 J William Piggott +.\" Authored new section: DATE-TIME CONFIGURATION. +.\" Subsections: Keeping Time..., LOCAL vs UTC, POSIX vs 'RIGHT'. +.\" +.TH HWCLOCK 8 "July 2017" "util-linux" "System Administration" +.SH NAME +hwclock \- time clocks utility +.SH SYNOPSIS +.B hwclock +.RI [ function ] +.RI [ option ...] +. +.SH DESCRIPTION +.B hwclock +is an administration tool for the time clocks. It can: display the +Hardware Clock time; set the Hardware Clock to a specified time; set the +Hardware Clock from the System Clock; set the System Clock from the +Hardware Clock; compensate for Hardware Clock drift; correct the System +Clock timescale; set the kernel's timezone, NTP timescale, and epoch +(Alpha only); and predict future +Hardware Clock values based on its drift rate. +.PP +Since v2.26 important changes were made to the +.B \-\-hctosys +function and the +.B \-\-directisa +option, and a new option +.B \-\-update\-drift +was added. See their respective descriptions below. +. +.SH FUNCTIONS +The following functions are mutually exclusive, only one can be given at +a time. If none is given, the default is \fB\-\-show\fR. +.TP +.B \-a, \-\-adjust +Add or subtract time from the Hardware Clock to account for systematic +drift since the last time the clock was set or adjusted. See the +discussion below, under +.BR "The Adjust Function" . +. +.TP +.B \-\-getepoch +.TQ +.B \-\-setepoch +These functions are for Alpha machines only, and are only available +through the Linux kernel RTC driver. +.sp +They are used to read and set the kernel's Hardware Clock epoch value. +Epoch is the number of years into AD to which a zero year value in the +Hardware Clock refers. For example, if the machine's BIOS sets the year +counter in the Hardware Clock to contain the number of full years since +1952, then the kernel's Hardware Clock epoch value must be 1952. +.sp +The \fB\%\-\-setepoch\fR function requires using the +.B \%\-\-epoch +option to specify the year. For example: +.RS +.IP "" 4 +.B hwclock\ \-\-setepoch\ \-\-epoch=1952 +.PP +The RTC driver attempts to guess the correct epoch value, so setting it +may not be required. +.PP +This epoch value is used whenever +.B \%hwclock +reads or sets the Hardware Clock on an Alpha machine. For ISA machines +the kernel uses the fixed Hardware Clock epoch of 1900. +.RE +. +.TP +.B \-\-predict +Predict what the Hardware Clock will read in the future based upon the +time given by the +.B \-\-date +option and the information in +.IR @ADJTIME_PATH@ . +This is useful, for example, to account for drift when setting a +Hardware Clock wakeup (aka alarm). See +.BR \%rtcwake (8). +.sp +Do not use this function if the Hardware Clock is being modified by +anything other than the current operating system's +.B \%hwclock +command, such as \%'11\ minute\ mode' or from dual-booting another OS. +. +.TP +.BR \-r , \ \-\-show +.TQ +.B \-\-get +.br +Read the Hardware Clock and print its time to standard output in the +.B ISO 8601 +format. +The time shown is always in local time, even if you keep your Hardware Clock +in UTC. See the +.B \%\-\-localtime +option. +.sp +Showing the Hardware Clock time is the default when no function is specified. +.sp +The +.B \-\-get +function also applies drift correction to the time read, based upon the +information in +.IR @ADJTIME_PATH@ . +Do not use this function if the Hardware Clock is being modified by +anything other than the current operating system's +.B \%hwclock +command, such as \%'11\ minute\ mode' or from dual-booting another OS. +. +.TP +.BR \-s , \ \-\-hctosys +Set the System Clock from the Hardware Clock. The time read from the Hardware +Clock is compensated to account for systematic drift before using it to set the +System Clock. See the discussion below, under +.BR "The Adjust Function" . +.sp +The System Clock must be kept in the UTC timescale for date-time +applications to work correctly in conjunction with the timezone configured +for the system. If the Hardware Clock is kept in local time then the time read +from it must be shifted to the UTC timescale before using it to set the System +Clock. The +.B \%\-\-hctosys +function does this based upon the information in the +.I @ADJTIME_PATH@ +file or the command line arguments +.BR \%\-\-localtime " and " \-\-utc . +Note: no daylight saving adjustment is made. See the discussion below, under +.BR "LOCAL vs UTC" . +.sp +The kernel also keeps a timezone value, the +.B \%\-\-hctosys +function sets it to the timezone configured for the system. The system +timezone is configured by the TZ environment variable or the +.I \%/etc/localtime +file, as +.BR \%tzset (3) +would interpret them. +The obsolete tz_dsttime field of the kernel's timezone value is set +to zero. (For details on what this field used to mean, see +.BR \%settimeofday (2).) +.sp +When used in a startup script, making the +.B \%\-\-hctosys +function the first caller of +.BR \%settimeofday (2) +from boot, it will set the NTP \%'11\ minute\ mode' timescale via the +.I \%persistent_clock_is_local +kernel variable. If the Hardware Clock's timescale configuration is +changed then a reboot is required to inform the kernel. See the +discussion below, under +.BR "Automatic Hardware Clock Synchronization by the Kernel" . +.sp +This is a good function to use in one of the system startup scripts before the +file systems are mounted read/write. +.sp +This function should never be used on a running system. Jumping system time +will cause problems, such as corrupted filesystem timestamps. Also, if +something has changed the Hardware Clock, like NTP's \%'11\ minute\ mode', then +.B \%\-\-hctosys +will set the time incorrectly by including drift compensation. +.sp +Drift compensation can be inhibited by setting the drift factor in +.I @ADJTIME_PATH@ +to zero. This setting will be persistent as long as the +.BR \%\-\-update\-drift " option is not used with " \%\-\-systohc +at shutdown (or anywhere else). Another way to inhibit this is by using the +.BR \%\-\-noadjfile " option when calling the " \%\-\-hctosys +function. A third method is to delete the +.IR @ADJTIME_PATH@ " file." +.B Hwclock +will then default to using the UTC timescale for the Hardware Clock. If +the Hardware Clock is ticking local time it will need to be defined in +the file. This can be done by calling +.BR hwclock\ \-\-localtime\ \-\-adjust ; +when the file is not present this command will not actually +adjust the Clock, but it will create the file with local time +configured, and a drift factor of zero. +.sp +A condition under which inhibiting +.BR hwclock 's +drift correction may be desired is when dual-booting multiple operating +systems. If while this instance of Linux is stopped, another OS changes +the Hardware Clock's value, then when this instance is started again the +drift correction applied will be incorrect. +.sp +.RB "For " hwclock 's +drift correction to work properly it is imperative that nothing changes +the Hardware Clock while its Linux instance is not running. +. +.TP +.B \-\-set +Set the Hardware Clock to the time given by the +.B \-\-date +option, and update the timestamps in +.IR @ADJTIME_PATH@ . +With the +.B \%\-\-update-drift +option also (re)calculate the drift factor. Try it without the option if +.BR \%\-\-set " fails. See " \%\-\-update-drift " below." +. +.TP +.B \-\-systz +This is an alternate to the +.B \%\-\-hctosys +function that does not read the Hardware Clock nor set the System Clock; +consequently there is not any drift correction. It is intended to be +used in a startup script on systems with kernels above version 2.6 where +you know the System Clock has been set from the Hardware Clock by the +kernel during boot. +.sp +It does the following things that are detailed above in the +.BR \%\-\-hctosys " function:" +.RS +.IP \(bu 2 +Corrects the System Clock timescale to UTC as needed. Only instead of +accomplishing this by setting the System Clock, +.B hwclock +simply informs the kernel and it handles the change. +.IP \(bu 2 +Sets the kernel's NTP \%'11\ minute\ mode' timescale. +.IP \(bu 2 +Sets the kernel's timezone. +.PP +The first two are only available on the first call of +.BR \%settimeofday (2) +after boot. Consequently this option only makes sense when used in a +startup script. If the Hardware Clocks timescale configuration is +changed then a reboot would be required to inform the kernel. +.RE +. +.TP +.BR \-w , \ \-\-systohc +Set the Hardware Clock from the System Clock, and update the timestamps in +.IR @ADJTIME_PATH@ . +With the +.B \%\-\-update-drift +option also (re)calculate the drift factor. Try it without the option if +.BR \%\-\-systohc " fails. See " \%\-\-update-drift " below." +. +.TP +.BR \-V , \ \-\-version +Display version information and exit. +. +.TP +.BR \-h , \ \-\-help +Display help text and exit. +. +.SH OPTIONS +. +.TP +.BI \-\-adjfile= filename +.RI "Override the default " @ADJTIME_PATH@ " file path." +. +.TP +.BI \%\-\-date= date_string +This option must be used with the +.B \-\-set +or +.B \%\-\-predict +functions, otherwise it is ignored. +.RS +.IP "" 4 +.B "hwclock\ \-\-set\ \-\-date='16:45'" +.IP "" 4 +.B "hwclock\ \-\-predict\ \-\-date='2525-08-14\ 07:11:05'" +.PP +The argument must be in local time, even if you keep your Hardware Clock in +UTC. See the +.B \%\-\-localtime +option. Therefore, the argument should not include any timezone information. +It also should not be a relative time like "+5 minutes", because +.BR \%hwclock 's +precision depends upon correlation between the argument's value and when the +enter key is pressed. Fractional seconds are silently dropped. This option is +capable of understanding many time and date formats, but the previous +parameters should be observed. +.RE +. +.TP +.BI \%\-\-delay= seconds +This option allows to overwrite internally used delay when set clock time. The +default is 0.5 (500ms) for rtc_cmos, for another RTC types the delay is 0. If +RTC type is impossible to determine (from sysfs) then it defaults also to 0.5 +to be backwardly compatible. +.RS +.PP +The 500ms default is based on commonly used MC146818A-compatible (x86) hardware clock. This +Hardware Clock can only be set to any integer time plus one half second. The +integer time is required because there is no interface to set or get a +fractional second. The additional half second delay is because the Hardware +Clock updates to the following second precisely 500 ms after setting the new +time. Unfortunately, this behavior is hardware specific and in same cases +another delay is required. +.RE +. +.TP +.TP +.BR \-D ", " \-\-debug +.RB Use\ \-\-verbose . +.RB The\ \%\-\-debug\ option +has been deprecated and may be repurposed or removed in a future release. +. +.TP +.B \-\-directisa +This option is meaningful for ISA compatible machines in the x86 and +x86_64 family. For other machines, it has no effect. This option tells +.B \%hwclock +to use explicit I/O instructions to access the Hardware Clock. +Without this option, +.B \%hwclock +will use the rtc device file, which it assumes to be driven by the Linux +RTC device driver. As of v2.26 it will no longer automatically use +directisa when the rtc driver is unavailable; this was causing an unsafe +condition that could allow two processes to access the Hardware Clock at +the same time. Direct hardware access from userspace should only be +used for testing, troubleshooting, and as a last resort when all other +methods fail. See the +.BR \-\-rtc " option." +. +.TP +.BI \-\-epoch= year +This option is required when using the +.BR \%\-\-setepoch \ function. +.RI "The minimum " year +value is 1900. The maximum is system dependent +.RB ( ULONG_MAX\ -\ 1 ). +. +.TP +.BR \-f , \ \-\-rtc=\fIfilename\fR +.RB "Override " \%hwclock 's +default rtc device file name. Otherwise it will +use the first one found in this order: +.in +4 +.br +.I /dev/rtc0 +.br +.I /dev/rtc +.br +.I /dev/misc/rtc +.br +.in +.RB "For " IA-64: +.in +4 +.br +.I /dev/efirtc +.br +.I /dev/misc/efirtc +.in +. +.TP +.BR \-l , \ \-\-localtime +.TQ +.BR \-u ", " \-\-utc +Indicate which timescale the Hardware Clock is set to. +.sp +The Hardware Clock may be configured to use either the UTC or the local +timescale, but nothing in the clock itself says which alternative is +being used. The +.BR \%\-\-localtime " or " \-\-utc +options give this information to the +.B \%hwclock +command. If you specify the wrong one (or specify neither and take a +wrong default), both setting and reading the Hardware Clock will be +incorrect. +.sp +If you specify neither +.BR \-\-utc " nor " \%\-\-localtime +then the one last given with a set function +.RB ( \-\-set ", " \%\-\-systohc ", or " \%\-\-adjust ), +as recorded in +.IR @ADJTIME_PATH@ , +will be used. If the adjtime file doesn't exist, the default is UTC. +.sp +Note: daylight saving time changes may be inconsistent when the +Hardware Clock is kept in local time. See the discussion below, under +.BR "LOCAL vs UTC" . +. +.TP +.B \-\-noadjfile +Disable the facilities provided by +.IR @ADJTIME_PATH@ . +.B \%hwclock +will not read nor write to that file with this option. Either +.BR \-\-utc " or " \%\-\-localtime +must be specified when using this option. +. +.TP +.B \-\-test +Do not actually change anything on the system, that is, the Clocks or +.I @ADJTIME_PATH@ +.RB ( \%\-\-verbose +is implicit with this option). +. +.TP +.B \-\-update\-drift +Update the Hardware Clock's drift factor in +.IR @ADJTIME_PATH@ . +It can only be used with +.BR \-\-set " or " \%\-\-systohc , +.sp +A minimum four hour period between settings is required. This is to +avoid invalid calculations. The longer the period, the more precise the +resulting drift factor will be. +.sp +This option was added in v2.26, because +it is typical for systems to call +.B \%hwclock\ \-\-systohc +at shutdown; with the old behaviour this would automatically +(re)calculate the drift factor which caused several problems: +.RS +.IP \(bu 2 +When using NTP with an \%'11\ minute\ mode' kernel the drift factor +would be clobbered to near zero. +.IP \(bu 2 +It would not allow the use of 'cold' drift correction. With most +configurations using 'cold' drift will yield favorable results. Cold, +means when the machine is turned off which can have a significant impact +on the drift factor. +.IP \(bu 2 +(Re)calculating drift factor on every shutdown delivers suboptimal +results. For example, if ephemeral conditions cause the machine to be +abnormally hot the drift factor calculation would be out of range. +.IP \(bu 2 +Significantly increased system shutdown times (as of v2.31 when not +using +.B \%\-\-update\-drift +the RTC is not read). +.PP +.RB "Having " \%hwclock +calculate the drift factor is a good starting point, but for optimal +results it will likely need to be adjusted by directly editing the +.I @ADJTIME_PATH@ +file. For most configurations once a machine's optimal drift factor is +crafted it should not need to be changed. Therefore, the old behavior to +automatically (re)calculate drift was changed and now requires this +option to be used. See the discussion below, under +.BR "The Adjust Function" . +.PP +This option requires reading the Hardware Clock before setting it. If +it cannot be read, then this option will cause the set functions to fail. +This can happen, for example, if the Hardware Clock is corrupted by a +power failure. In that case, the clock must first be set without this +option. Despite it not working, the resulting drift correction factor +would be invalid anyway. +.RE +. +.TP +.BR \-v ", " \-\-verbose +Display more details about what +.B \%hwclock +is doing internally. +. +.SH NOTES +. +.SS Clocks in a Linux System +.PP +There are two types of date-time clocks: +.PP +.B The Hardware Clock: +This clock is an independent hardware device, with its own power domain +(battery, capacitor, etc), that operates when the machine is powered off, +or even unplugged. +.PP +On an ISA compatible system, this clock is specified as part of the ISA +standard. A control program can read or set this clock only to a whole +second, but it can also detect the edges of the 1 second clock ticks, so +the clock actually has virtually infinite precision. +.PP +This clock is commonly called the hardware clock, the real time clock, +the RTC, the BIOS clock, and the CMOS clock. Hardware Clock, in its +capitalized form, was coined for use by +.BR \%hwclock . +The Linux kernel also refers to it as the persistent clock. +.PP +Some non-ISA systems have a few real time clocks with +only one of them having its own power domain. +A very low power external I2C or SPI clock chip might be used with a +backup battery as the hardware clock to initialize a more functional +integrated real-time clock which is used for most other purposes. +.PP +.B The System Clock: +This clock is part of the Linux kernel and is driven by +a timer interrupt. (On an ISA machine, the timer interrupt is part of +the ISA standard.) It has meaning only while Linux is running on the +machine. The System Time is the number of seconds since 00:00:00 +January 1, 1970 UTC (or more succinctly, the number of seconds since +1969 UTC). The System Time is not an integer, though. It has virtually +infinite precision. +.PP +The System Time is the time that matters. The Hardware Clock's basic +purpose is to keep time when Linux is not running so that the System +Clock can be initialized from it at boot. Note that in DOS, for which +ISA was designed, the Hardware Clock is the only real time clock. +.PP +It is important that the System Time not have any discontinuities such as +would happen if you used the +.BR \%date (1) +program to set it while the system is running. You can, however, do whatever +you want to the Hardware Clock while the system is running, and the next +time Linux starts up, it will do so with the adjusted time from the Hardware +Clock. Note: currently this is not possible on most systems because +.B \%hwclock\ \-\-systohc +is called at shutdown. +.PP +The Linux kernel's timezone is set by +.BR hwclock . +But don't be misled -- almost nobody cares what timezone the kernel +thinks it is in. Instead, programs that care about the timezone +(perhaps because they want to display a local time for you) almost +always use a more traditional method of determining the timezone: They +use the TZ environment variable or the +.I \%/etc/localtime +file, as explained in the man page for +.BR \%tzset (3). +However, some programs and fringe parts of the Linux kernel such as filesystems +use the kernel's timezone value. An example is the vfat filesystem. If the +kernel timezone value is wrong, the vfat filesystem will report and set the +wrong timestamps on files. Another example is the kernel's NTP \%'11\ minute\ mode'. +If the kernel's timezone value and/or the +.I \%persistent_clock_is_local +variable are wrong, then the Hardware Clock will be set incorrectly +by \%'11\ minute\ mode'. See the discussion below, under +.BR "Automatic Hardware Clock Synchronization by the Kernel" . +.PP +.B \%hwclock +sets the kernel's timezone to the value indicated by TZ or +.IR \%/etc/localtime " with the" +.BR \%\-\-hctosys " or " \%\-\-systz " functions." +.PP +The kernel's timezone value actually consists of two parts: 1) a field +tz_minuteswest indicating how many minutes local time (not adjusted +for DST) lags behind UTC, and 2) a field tz_dsttime indicating +the type of Daylight Savings Time (DST) convention that is in effect +in the locality at the present time. +This second field is not used under Linux and is always zero. +See also +.BR \%settimeofday (2). +. +.SS Hardware Clock Access Methods +.PP +.B \%hwclock +uses many different ways to get and set Hardware Clock values. The most +normal way is to do I/O to the rtc device special file, which is +presumed to be driven by the rtc device driver. Also, Linux systems +using the rtc framework with udev, are capable of supporting multiple +Hardware Clocks. This may bring about the need to override the default +rtc device by specifying one with the +.BR \-\-rtc " option." +.PP +However, this method is not always available as older systems do not +have an rtc driver. On these systems, the method of accessing the +Hardware Clock depends on the system hardware. +.PP +On an ISA compatible system, +.B \%hwclock +can directly access the "CMOS memory" registers that +constitute the clock, by doing I/O to Ports 0x70 and 0x71. It does +this with actual I/O instructions and consequently can only do it if +running with superuser effective userid. This method may be used by +specifying the +.BR \%\-\-directisa " option." +.PP +This is a really poor method of accessing the clock, for all the +reasons that userspace programs are generally not supposed to do +direct I/O and disable interrupts. +.B \%hwclock +provides it for testing, troubleshooting, and because it may be the +only method available on ISA systems which do not have a working rtc +device driver. +.SS The Adjust Function +.PP +The Hardware Clock is usually not very accurate. However, much of its +inaccuracy is completely predictable - it gains or loses the same amount +of time every day. This is called systematic drift. +.BR \%hwclock "'s " \%\-\-adjust +function lets you apply systematic drift corrections to the +Hardware Clock. +.PP +It works like this: +.BR \%hwclock " keeps a file," +.IR @ADJTIME_PATH@ , +that keeps some historical information. This is called the adjtime file. +.PP +Suppose you start with no adjtime file. You issue a +.B \%hwclock\ \-\-set +command to set the Hardware Clock to the true current time. +.B \%hwclock +creates the adjtime file and records in it the current time as the +last time the clock was calibrated. +Five days later, the clock has gained 10 seconds, so you issue a +.B \%hwclock\ \-\-set\ \-\-update\-drift +command to set it back 10 seconds. +.B \%hwclock +updates the adjtime file to show the current time as the last time the +clock was calibrated, and records 2 seconds per day as the systematic +drift rate. 24 hours go by, and then you issue a +.B \%hwclock\ \-\-adjust +command. +.B \%hwclock +consults the adjtime file and sees that the clock gains 2 seconds per +day when left alone and that it has been left alone for exactly one +day. So it subtracts 2 seconds from the Hardware Clock. It then +records the current time as the last time the clock was adjusted. +Another 24 hours go by and you issue another +.BR \%hwclock\ \-\-adjust . +.B \%hwclock +does the same thing: subtracts 2 seconds and updates the adjtime file +with the current time as the last time the clock was adjusted. +.PP +When you use the +.BR \%\-\-update\-drift " option with " \-\-set " or " \%\-\-systohc , +the systematic drift rate is (re)calculated by comparing the fully drift +corrected current Hardware Clock time with the new set time, from that +it derives the 24 hour drift rate based on the last calibrated timestamp +from the adjtime file. This updated drift factor is then saved in +.IR @ADJTIME_PATH@ . +.PP +A small amount of error creeps in when +the Hardware Clock is set, so +.B \%\-\-adjust +refrains from making any adjustment that is less +than 1 second. Later on, when you request an adjustment again, the accumulated +drift will be more than 1 second and +.B \%\-\-adjust +will make the adjustment including any fractional amount. +.PP +.B \%hwclock\ \-\-hctosys +also uses the adjtime file data to compensate the value read from the Hardware +Clock before using it to set the System Clock. It does not share the 1 second +limitation of +.BR \%\-\-adjust , +and will correct sub-second drift values immediately. It does not +change the Hardware Clock time nor the adjtime file. This may eliminate +the need to use +.BR \%\-\-adjust , +unless something else on the system needs the Hardware Clock to be +compensated. +. +.SS The Adjtime File +While named for its historical purpose of controlling adjustments only, +it actually contains other information used by +.B hwclock +from one invocation to the next. +.PP +The format of the adjtime file is, in ASCII: +.PP +Line 1: Three numbers, separated by blanks: 1) the systematic drift rate +in seconds per day, floating point decimal; 2) the resulting number of +seconds since 1969 UTC of most recent adjustment or calibration, +decimal integer; 3) zero (for compatibility with +.BR \%clock (8)) +as a decimal integer. +.PP +Line 2: One number: the resulting number of seconds since 1969 UTC of most +recent calibration. Zero if there has been no calibration yet or it +is known that any previous calibration is moot (for example, because +the Hardware Clock has been found, since that calibration, not to +contain a valid time). This is a decimal integer. +.PP +Line 3: "UTC" or "LOCAL". Tells whether the Hardware Clock is set to +Coordinated Universal Time or local time. You can always override this +value with options on the +.B \%hwclock +command line. +.PP +You can use an adjtime file that was previously used with the +.BR \%clock "(8) program with " \%hwclock . +. +.SS Automatic Hardware Clock Synchronization by the Kernel +.PP +You should be aware of another way that the Hardware Clock is kept +synchronized in some systems. The Linux kernel has a mode wherein it +copies the System Time to the Hardware Clock every 11 minutes. This mode +is a compile time option, so not all kernels will have this capability. +This is a good mode to use when you are using something sophisticated +like NTP to keep your System Clock synchronized. (NTP is a way to keep +your System Time synchronized either to a time server somewhere on the +network or to a radio clock hooked up to your system. See RFC 1305.) +.PP +If the kernel is compiled with the \%'11\ minute\ mode' option it will +be active when the kernel's clock discipline is in a synchronized state. +When in this state, bit 6 (the bit that is set in the mask 0x0040) +of the kernel's +.I \%time_status +variable is unset. This value is output as the 'status' line of the +.BR \%adjtimex\ --print " or " \%ntptime " commands." +.PP +It takes an outside influence, like the NTP daemon +to put the kernel's clock discipline into a synchronized state, and +therefore turn on \%'11\ minute\ mode'. +It can be turned off by running anything that sets the System Clock the old +fashioned way, including +.BR "\%hwclock\ \-\-hctosys" . +However, if the NTP daemon is still running, it will turn \%'11\ minute\ mode' +back on again the next time it synchronizes the System Clock. +.PP +If your system runs with \%'11\ minute\ mode' on, it may need to use either +.BR \%\-\-hctosys " or " \%\-\-systz +in a startup script, especially if the Hardware Clock is configured to use +the local timescale. Unless the kernel is informed of what timescale the +Hardware Clock is using, it may clobber it with the wrong one. The kernel +uses UTC by default. +.PP +The first userspace command to set the System Clock informs the +kernel what timescale the Hardware Clock is using. This happens via the +.I \%persistent_clock_is_local +kernel variable. If +.BR \%\-\-hctosys " or " \%\-\-systz +is the first, it will set this variable according to the adjtime file or the +appropriate command-line argument. Note that when using this capability and the +Hardware Clock timescale configuration is changed, then a reboot is required to +notify the kernel. +.PP +.B \%hwclock\ \-\-adjust +should not be used with NTP \%'11\ minute\ mode'. +. +.SS ISA Hardware Clock Century value +.PP +There is some sort of standard that defines CMOS memory Byte 50 on an ISA +machine as an indicator of what century it is. +.B \%hwclock +does not use or set that byte because there are some machines that +don't define the byte that way, and it really isn't necessary anyway, +since the year-of-century does a good job of implying which century it +is. +.PP +If you have a bona fide use for a CMOS century byte, contact the +.B \%hwclock +maintainer; an option may be appropriate. +.PP +Note that this section is only relevant when you are using the "direct +ISA" method of accessing the Hardware Clock. +ACPI provides a standard way to access century values, when they +are supported by the hardware. +. +.SH DATE-TIME CONFIGURATION +.in +4 +.SS Keeping Time without External Synchronization +.in +.PP +This discussion is based on the following conditions: +.IP \(bu 2 +Nothing is running that alters the date-time clocks, such as NTP daemon or a cron job." +.IP \(bu 2 +The system timezone is configured for the correct local time. See below, under +.BR "POSIX vs 'RIGHT'" . +.IP \(bu 2 +Early during startup the following are called, in this order: +.br +.BI \%adjtimex\ \-\-tick \ value\ \-\-frequency \ value +.br +.B \%hwclock\ \-\-hctosys +.IP \(bu 2 +During shutdown the following is called: +.br +.B \%hwclock\ \-\-systohc +.PP +.in +4 +.BR * " Systems without " adjtimex " may use " ntptime . +.in +.PP +Whether maintaining precision time with NTP daemon +or not, it makes sense to configure the system to keep reasonably good +date-time on its own. +.PP +The first step in making that happen is having a clear understanding of +the big picture. There are two completely separate hardware devices +running at their own speed and drifting away from the 'correct' time at +their own rates. The methods and software for drift correction are +different for each of them. However, most systems are configured to +exchange values between these two clocks at startup and shutdown. Now +the individual device's time keeping errors are transferred back and +forth between each other. Attempt to configure drift correction for only +one of them, and the other's drift will be overlaid upon it. +.PP +This problem can be avoided when configuring drift correction for the +System Clock by simply not shutting down the machine. This, plus the +fact that all of +.BR \%hwclock 's +precision (including calculating drift factors) depends upon the System +Clock's rate being correct, means that configuration of the System Clock +should be done first. +.PP +The System Clock drift is corrected with the +.BR \%adjtimex "(8) command's " \-\-tick " and " \%\-\-frequency +options. These two work together: tick is the coarse adjustment and +frequency is the fine adjustment. (For systems that do not have an +.BR \%adjtimex " package," +.BI \%ntptime\ \-f\ ppm +may be used instead.) +.PP +Some Linux distributions attempt to automatically calculate the System +Clock drift with +.BR \%adjtimex 's +compare operation. Trying to correct one +drifting clock by using another drifting clock as a reference is akin to +a dog trying to catch its own tail. Success may happen eventually, but +great effort and frustration will likely precede it. This automation may +yield an improvement over no configuration, but expecting optimum +results would be in error. A better choice for manual configuration +would be +.BR \%adjtimex 's " \-\-log " options. +.PP +It may be more effective to simply track the System Clock drift with +.BR \%sntp ", or " \%date\ \-Ins +and a precision timepiece, and then calculate the correction manually. +.PP +After setting the tick and frequency values, continue to test and refine the +adjustments until the System Clock keeps good time. See +.BR \%adjtimex (8) +for more information and the example demonstrating manual drift +calculations. +.PP +Once the System Clock is ticking smoothly, move on to the Hardware Clock. +.PP +As a rule, cold drift will work best for most use cases. This should be +true even for 24/7 machines whose normal downtime consists of a reboot. +In that case the drift factor value makes little difference. But on the +rare occasion that the machine is shut down for an extended period, then +cold drift should yield better results. +.PP +.B Steps to calculate cold drift: +.IP 1 2 +.B "Ensure that NTP daemon will not be launched at startup." +.IP 2 2 +.RI The " System Clock " "time must be correct at shutdown!" +.IP 3 2 +Shut down the system. +.IP 4 2 +Let an extended period pass without changing the Hardware Clock. +.IP 5 2 +Start the system. +.IP 6 2 +.RB "Immediately use " hwclock " to set the correct time, adding the" +.BR \%\-\-update\-drift " option." +.PP +Note: if step 6 uses +.BR \%\-\-systohc , +then the System Clock must be set correctly (step 6a) just before doing so. +.PP +.RB "Having " hwclock +calculate the drift factor is a good starting point, but for optimal +results it will likely need to be adjusted by directly editing the +.I @ADJTIME_PATH@ +file. Continue to test and refine the drift factor until the Hardware +Clock is corrected properly at startup. To check this, first make sure +that the System Time is correct before shutdown and then use +.BR \%sntp ", or " \%date\ \-Ins +and a precision timepiece, immediately after startup. +.SS LOCAL vs UTC +Keeping the Hardware Clock in a local timescale causes inconsistent +daylight saving time results: +.IP \(bu 2 +If Linux is running during a daylight saving time change, the time +written to the Hardware Clock will be adjusted for the change. +.IP \(bu 2 +If Linux is NOT running during a daylight saving time change, the time +read from the Hardware Clock will NOT be adjusted for the change. +.PP +The Hardware Clock on an ISA compatible system keeps only a date and time, +it has no concept of timezone nor daylight saving. Therefore, when +.B hwclock +is told that it is in local time, it assumes it is in the 'correct' +local time and makes no adjustments to the time read from it. +.PP +Linux handles daylight saving time changes transparently only when the +Hardware Clock is kept in the UTC timescale. Doing so is made easy for +system administrators as +.B \%hwclock +uses local time for its output and as the argument to the +.BR \%\-\-date " option." +.PP +POSIX systems, like Linux, are designed to have the System Clock operate +in the UTC timescale. The Hardware Clock's purpose is to initialize the +System Clock, so also keeping it in UTC makes sense. +.PP +Linux does, however, attempt to accommodate the Hardware Clock being in +the local timescale. This is primarily for dual-booting with older +versions of MS Windows. From Windows 7 on, the RealTimeIsUniversal +registry key is supposed to be working properly so that its Hardware +Clock can be kept in UTC. +. +.SS POSIX vs 'RIGHT' +A discussion on date-time configuration would be incomplete without +addressing timezones, this is mostly well covered by +.BR tzset (3). +One area that seems to have no documentation is the 'right' +directory of the Time Zone Database, sometimes called tz or zoneinfo. +.PP +There are two separate databases in the zoneinfo system, posix +and 'right'. 'Right' (now named zoneinfo\-leaps) includes leap seconds and posix +does not. To use the 'right' database the System Clock must be set to +\%(UTC\ +\ leap seconds), which is equivalent to \%(TAI\ \-\ 10). This +allows calculating the +exact number of seconds between two dates that cross a leap second +epoch. The System Clock is then converted to the correct civil time, +including UTC, by using the 'right' timezone files which subtract the +leap seconds. Note: this configuration is considered experimental and is +known to have issues. +.PP +To configure a system to use a particular database all of the files +located in its directory must be copied to the root of +.IR \%/usr/share/zoneinfo . +Files are never used directly from the posix or 'right' subdirectories, e.g., +.RI \%TZ=' right/Europe/Dublin '. +This habit was becoming so common that the upstream zoneinfo project +restructured the system's file tree by moving the posix and 'right' +subdirectories out of the zoneinfo directory and into sibling directories: +.PP +.in +2 +.I /usr/share/zoneinfo +.br +.I /usr/share/zoneinfo\-posix +.br +.I /usr/share/zoneinfo\-leaps +.PP +Unfortunately, some Linux distributions are changing it back to the old +tree structure in their packages. So the problem of system +administrators reaching into the 'right' subdirectory persists. This +causes the system timezone to be configured to include leap seconds +while the zoneinfo database is still configured to exclude them. Then +when an application such as a World Clock needs the South_Pole timezone +file; or an email MTA, or +.B hwclock +needs the UTC timezone file; they fetch it from the root of +.I \%/usr/share/zoneinfo +, because that is what they are supposed to do. Those files exclude leap +seconds, but the System Clock now includes them, causing an incorrect +time conversion. +.PP +Attempting to mix and match files from these separate databases will not +work, because they each require the System Clock to use a different +timescale. The zoneinfo database must be configured to use either posix +or 'right', as described above, or by assigning a database path to the +.SB TZDIR +environment variable. +.SH EXIT STATUS +One of the following exit values will be returned: +.TP +.BR EXIT_SUCCESS " ('0' on POSIX systems)" +Successful program execution. +.TP +.BR EXIT_FAILURE " ('1' on POSIX systems)" +The operation failed or the command syntax was not valid. +.SH ENVIRONMENT +.TP +.B TZ +If this variable is set its value takes precedence over the system +configured timezone. +.TP +.B TZDIR +If this variable is set its value takes precedence over the system +configured timezone database directory path. +.SH FILES +.TP +.I @ADJTIME_PATH@ +The configuration and state file for hwclock. +.TP +.I /etc/localtime +The system timezone file. +.TP +.I /usr/share/zoneinfo/ +The system timezone database directory. +.PP +Device files +.B hwclock +may try for Hardware Clock access: +.br +.I /dev/rtc0 +.br +.I /dev/rtc +.br +.I /dev/misc/rtc +.br +.I /dev/efirtc +.br +.I /dev/misc/efirtc +.SH "SEE ALSO" +.BR date (1), +.BR adjtimex (8), +.BR gettimeofday (2), +.BR settimeofday (2), +.BR crontab (1), +.BR tzset (3) +. +.SH AUTHORS +Written by Bryan Henderson, September 1996 (bryanh@giraffe-data.com), +based on work done on the +.BR \%clock (8) +program by Charles Hedrick, Rob Hooft, and Harald Koenig. +See the source code for complete history and credits. +. +.SH AVAILABILITY +The hwclock command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/hwclock.c b/sys-utils/hwclock.c new file mode 100644 index 0000000..d9acbaf --- /dev/null +++ b/sys-utils/hwclock.c @@ -0,0 +1,1551 @@ +/* + * hwclock.c + * + * clock.c was written by Charles Hedrick, hedrick@cs.rutgers.edu, Apr 1992 + * Modified for clock adjustments - Rob Hooft <hooft@chem.ruu.nl>, Nov 1992 + * Improvements by Harald Koenig <koenig@nova.tat.physik.uni-tuebingen.de> + * and Alan Modra <alan@spri.levels.unisa.edu.au>. + * + * Major rewrite by Bryan Henderson <bryanh@giraffe-data.com>, 96.09.19. + * The new program is called hwclock. New features: + * + * - You can set the hardware clock without also modifying the system + * clock. + * - You can read and set the clock with finer than 1 second precision. + * - When you set the clock, hwclock automatically refigures the drift + * rate, based on how far off the clock was before you set it. + * + * Reshuffled things, added sparc code, and re-added alpha stuff + * by David Mosberger <davidm@azstarnet.com> + * and Jay Estabrook <jestabro@amt.tay1.dec.com> + * and Martin Ostermann <ost@coments.rwth-aachen.de>, aeb@cwi.nl, 990212. + * + * Fix for Award 2094 bug, Dave Coffin (dcoffin@shore.net) 11/12/98 + * Change of local time handling, Stefan Ring <e9725446@stud3.tuwien.ac.at> + * Change of adjtime handling, James P. Rutledge <ao112@rgfn.epcc.edu>. + * + * Distributed under GPL + */ +/* + * Explanation of `adjusting' (Rob Hooft): + * + * The problem with my machine is that its CMOS clock is 10 seconds + * per day slow. With this version of clock.c, and my '/etc/rc.local' + * reading '/etc/clock -au' instead of '/etc/clock -u -s', this error + * is automatically corrected at every boot. + * + * To do this job, the program reads and writes the file '/etc/adjtime' + * to determine the correction, and to save its data. In this file are + * three numbers: + * + * 1) the correction in seconds per day. (So if your clock runs 5 + * seconds per day fast, the first number should read -5.0) + * 2) the number of seconds since 1/1/1970 the last time the program + * was used + * 3) the remaining part of a second which was leftover after the last + * adjustment + * + * Installation and use of this program: + * + * a) create a file '/etc/adjtime' containing as the first and only + * line: '0.0 0 0.0' + * b) run 'clock -au' or 'clock -a', depending on whether your cmos is + * in universal or local time. This updates the second number. + * c) set your system time using the 'date' command. + * d) update your cmos time using 'clock -wu' or 'clock -w' + * e) replace the first number in /etc/adjtime by your correction. + * f) put the command 'clock -au' or 'clock -a' in your '/etc/rc.local' + */ + +#include <errno.h> +#include <getopt.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> + +#include "c.h" +#include "closestream.h" +#include "nls.h" +#include "optutils.h" +#include "pathnames.h" +#include "hwclock.h" +#include "timeutils.h" +#include "env.h" +#include "xalloc.h" +#include "path.h" +#include "strutils.h" + +#ifdef HAVE_LIBAUDIT +#include <libaudit.h> +static int hwaudit_fd = -1; +#endif + +UL_DEBUG_DEFINE_MASK(hwclock); +UL_DEBUG_DEFINE_MASKNAMES(hwclock) = UL_DEBUG_EMPTY_MASKNAMES; + +/* The struct that holds our hardware access routines */ +static struct clock_ops *ur; + +/* Maximal clock adjustment in seconds per day. + (adjtime() glibc call has 2145 seconds limit on i386, so it is good enough for us as well, + 43219 is a maximal safe value preventing exact_adjustment overflow.) */ +#define MAX_DRIFT 2145.0 + +struct adjtime { + /* + * This is information we keep in the adjtime file that tells us how + * to do drift corrections. Elements are all straight from the + * adjtime file, so see documentation of that file for details. + * Exception is <dirty>, which is an indication that what's in this + * structure is not what's in the disk file (because it has been + * updated since read from the disk file). + */ + int dirty; + /* line 1 */ + double drift_factor; + time_t last_adj_time; + double not_adjusted; + /* line 2 */ + time_t last_calib_time; + /* + * The most recent time that we set the clock from an external + * authority (as opposed to just doing a drift adjustment) + */ + /* line 3 */ + enum a_local_utc { UTC = 0, LOCAL, UNKNOWN } local_utc; + /* + * To which time zone, local or UTC, we most recently set the + * hardware clock. + */ +}; + +static void hwclock_init_debug(const char *str) +{ + __UL_INIT_DEBUG_FROM_STRING(hwclock, HWCLOCK_DEBUG_, 0, str); + + DBG(INIT, ul_debug("hwclock debug mask: 0x%04x", hwclock_debug_mask)); + DBG(INIT, ul_debug("hwclock version: %s", PACKAGE_STRING)); +} + +/* FOR TESTING ONLY: inject random delays of up to 1000ms */ +static void up_to_1000ms_sleep(void) +{ + int usec = random() % 1000000; + + DBG(RANDOM_SLEEP, ul_debug("sleeping ~%d usec", usec)); + xusleep(usec); +} + +/* + * time_t to timeval conversion. + */ +static struct timeval t2tv(time_t timet) +{ + struct timeval rettimeval; + + rettimeval.tv_sec = timet; + rettimeval.tv_usec = 0; + return rettimeval; +} + +/* + * The difference in seconds between two times in "timeval" format. + */ +double time_diff(struct timeval subtrahend, struct timeval subtractor) +{ + return (subtrahend.tv_sec - subtractor.tv_sec) + + (subtrahend.tv_usec - subtractor.tv_usec) / 1E6; +} + +/* + * The time, in "timeval" format, which is <increment> seconds after the + * time <addend>. Of course, <increment> may be negative. + */ +static struct timeval time_inc(struct timeval addend, double increment) +{ + struct timeval newtime; + + newtime.tv_sec = addend.tv_sec + (int)increment; + newtime.tv_usec = addend.tv_usec + (increment - (int)increment) * 1E6; + + /* + * Now adjust it so that the microsecond value is between 0 and 1 + * million. + */ + if (newtime.tv_usec < 0) { + newtime.tv_usec += 1E6; + newtime.tv_sec -= 1; + } else if (newtime.tv_usec >= 1E6) { + newtime.tv_usec -= 1E6; + newtime.tv_sec += 1; + } + return newtime; +} + +static int +hw_clock_is_utc(const struct hwclock_control *ctl, + const struct adjtime adjtime) +{ + int ret; + + if (ctl->utc) + ret = 1; /* --utc explicitly given on command line */ + else if (ctl->local_opt) + ret = 0; /* --localtime explicitly given */ + else + /* get info from adjtime file - default is UTC */ + ret = (adjtime.local_utc != LOCAL); + if (ctl->verbose) + printf(_("Assuming hardware clock is kept in %s time.\n"), + ret ? _("UTC") : _("local")); + return ret; +} + +/* + * Read the adjustment parameters out of the /etc/adjtime file. + * + * Return them as the adjtime structure <*adjtime_p>. Its defaults are + * initialized in main(). + */ +static int read_adjtime(const struct hwclock_control *ctl, + struct adjtime *adjtime_p) +{ + FILE *adjfile; + char line1[81]; /* String: first line of adjtime file */ + char line2[81]; /* String: second line of adjtime file */ + char line3[81]; /* String: third line of adjtime file */ + + if (access(ctl->adj_file_name, R_OK) != 0) + return EXIT_SUCCESS; + + adjfile = fopen(ctl->adj_file_name, "r"); /* open file for reading */ + if (adjfile == NULL) { + warn(_("cannot open %s"), ctl->adj_file_name); + return EXIT_FAILURE; + } + + if (!fgets(line1, sizeof(line1), adjfile)) + line1[0] = '\0'; /* In case fgets fails */ + if (!fgets(line2, sizeof(line2), adjfile)) + line2[0] = '\0'; /* In case fgets fails */ + if (!fgets(line3, sizeof(line3), adjfile)) + line3[0] = '\0'; /* In case fgets fails */ + + fclose(adjfile); + + sscanf(line1, "%lf %ld %lf", + &adjtime_p->drift_factor, + &adjtime_p->last_adj_time, + &adjtime_p->not_adjusted); + + sscanf(line2, "%ld", &adjtime_p->last_calib_time); + + if (!strcmp(line3, "UTC\n")) { + adjtime_p->local_utc = UTC; + } else if (!strcmp(line3, "LOCAL\n")) { + adjtime_p->local_utc = LOCAL; + } else { + adjtime_p->local_utc = UNKNOWN; + if (line3[0]) { + warnx(_("Warning: unrecognized third line in adjtime file\n" + "(Expected: `UTC' or `LOCAL' or nothing.)")); + } + } + + if (ctl->verbose) { + printf(_ + ("Last drift adjustment done at %ld seconds after 1969\n"), + (long)adjtime_p->last_adj_time); + printf(_("Last calibration done at %ld seconds after 1969\n"), + (long)adjtime_p->last_calib_time); + printf(_("Hardware clock is on %s time\n"), + (adjtime_p->local_utc == + LOCAL) ? _("local") : (adjtime_p->local_utc == + UTC) ? _("UTC") : _("unknown")); + } + + return EXIT_SUCCESS; +} + +/* + * Wait until the falling edge of the Hardware Clock's update flag so that + * any time that is read from the clock immediately after we return will be + * exact. + * + * The clock only has 1 second precision, so it gives the exact time only + * once per second, right on the falling edge of the update flag. + * + * We wait (up to one second) either blocked waiting for an rtc device or in + * a CPU spin loop. The former is probably not very accurate. + * + * Return 0 if it worked, nonzero if it didn't. + */ +static int synchronize_to_clock_tick(const struct hwclock_control *ctl) +{ + int rc; + + if (ctl->verbose) + printf(_("Waiting for clock tick...\n")); + + rc = ur->synchronize_to_clock_tick(ctl); + + if (ctl->verbose) { + if (rc) + printf(_("...synchronization failed\n")); + else + printf(_("...got clock tick\n")); + } + + return rc; +} + +/* + * Convert a time in broken down format (hours, minutes, etc.) into standard + * unix time (seconds into epoch). Return it as *systime_p. + * + * The broken down time is argument <tm>. This broken down time is either + * in local time zone or UTC, depending on value of logical argument + * "universal". True means it is in UTC. + * + * If the argument contains values that do not constitute a valid time, and + * mktime() recognizes this, return *valid_p == false and *systime_p + * undefined. However, mktime() sometimes goes ahead and computes a + * fictional time "as if" the input values were valid, e.g. if they indicate + * the 31st day of April, mktime() may compute the time of May 1. In such a + * case, we return the same fictional value mktime() does as *systime_p and + * return *valid_p == true. + */ +static int +mktime_tz(const struct hwclock_control *ctl, struct tm tm, + time_t *systime_p) +{ + int valid; + + if (ctl->universal) + *systime_p = timegm(&tm); + else + *systime_p = mktime(&tm); + if (*systime_p == -1) { + /* + * This apparently (not specified in mktime() documentation) + * means the 'tm' structure does not contain valid values + * (however, not containing valid values does _not_ imply + * mktime() returns -1). + */ + valid = 0; + if (ctl->verbose) + printf(_("Invalid values in hardware clock: " + "%4d/%.2d/%.2d %.2d:%.2d:%.2d\n"), + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + } else { + valid = 1; + if (ctl->verbose) + printf(_ + ("Hw clock time : %4d/%.2d/%.2d %.2d:%.2d:%.2d = " + "%ld seconds since 1969\n"), tm.tm_year + 1900, + tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, + tm.tm_sec, (long)*systime_p); + } + return valid; +} + +/* + * Read the hardware clock and return the current time via <tm> argument. + * + * Use the method indicated by <method> argument to access the hardware + * clock. + */ +static int +read_hardware_clock(const struct hwclock_control *ctl, + int *valid_p, time_t *systime_p) +{ + struct tm tm; + int err; + + err = ur->read_hardware_clock(ctl, &tm); + if (err) + return err; + + if (ctl->verbose) + printf(_ + ("Time read from Hardware Clock: %4d/%.2d/%.2d %02d:%02d:%02d\n"), + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, + tm.tm_min, tm.tm_sec); + *valid_p = mktime_tz(ctl, tm, systime_p); + + return 0; +} + +/* + * Set the Hardware Clock to the time <newtime>, in local time zone or UTC, + * according to <universal>. + */ +static void +set_hardware_clock(const struct hwclock_control *ctl, const time_t newtime) +{ + struct tm new_broken_time; + /* + * Time to which we will set Hardware Clock, in broken down format, + * in the time zone of caller's choice + */ + + if (ctl->universal) + gmtime_r(&newtime, &new_broken_time); + else + localtime_r(&newtime, &new_broken_time); + + if (ctl->verbose) + printf(_("Setting Hardware Clock to %.2d:%.2d:%.2d " + "= %ld seconds since 1969\n"), + new_broken_time.tm_hour, new_broken_time.tm_min, + new_broken_time.tm_sec, (long)newtime); + + if (!ctl->testing) + ur->set_hardware_clock(ctl, &new_broken_time); +} + +static double +get_hardware_delay(const struct hwclock_control *ctl) +{ + const char *devpath, *rtcname; + char name[128 + 1]; + struct path_cxt *pc; + int rc; + + devpath = ur->get_device_path(); + if (!devpath) + goto unknown; + + rtcname = strrchr(devpath, '/'); + if (!rtcname || !*(rtcname + 1)) + goto unknown; + rtcname++; + + pc = ul_new_path("/sys/class/rtc/%s", rtcname); + if (!pc) + goto unknown; + rc = ul_path_scanf(pc, "name", "%128[^\n ]", &name); + ul_unref_path(pc); + + if (rc != 1 || !*name) + goto unknown; + + if (ctl->verbose) + printf(_("RTC type: '%s'\n"), name); + + /* MC146818A-compatible (x86) */ + if (strcmp(name, "rtc_cmos") == 0) + return 0.5; + + /* Another HW */ + return 0; +unknown: + /* Let's be backwardly compatible */ + return 0.5; +} + + +/* + * Set the Hardware Clock to the time "sethwtime", in local time zone or + * UTC, according to "universal". + * + * Wait for a fraction of a second so that "sethwtime" is the value of the + * Hardware Clock as of system time "refsystime", which is in the past. For + * example, if "sethwtime" is 14:03:05 and "refsystime" is 12:10:04.5 and + * the current system time is 12:10:06.0: Wait .5 seconds (to make exactly 2 + * seconds since "refsystime") and then set the Hardware Clock to 14:03:07, + * thus getting a precise and retroactive setting of the clock. The .5 delay is + * default on x86, see --delay and get_hardware_delay(). + * + * (Don't be confused by the fact that the system clock and the Hardware + * Clock differ by two hours in the above example. That's just to remind you + * that there are two independent time scales here). + * + * This function ought to be able to accept set times as fractional times. + * Idea for future enhancement. + */ +static void +set_hardware_clock_exact(const struct hwclock_control *ctl, + const time_t sethwtime, + const struct timeval refsystime) +{ + /* + * The Hardware Clock can only be set to any integer time plus one + * half second. The integer time is required because there is no + * interface to set or get a fractional second. The additional half + * second is because the Hardware Clock updates to the following + * second precisely 500 ms (not 1 second!) after you release the + * divider reset (after setting the new time) - see description of + * DV2, DV1, DV0 in Register A in the MC146818A data sheet (and note + * that although that document doesn't say so, real-world code seems + * to expect that the SET bit in Register B functions the same way). + * That means that, e.g., when you set the clock to 1:02:03, it + * effectively really sets it to 1:02:03.5, because it will update to + * 1:02:04 only half a second later. Our caller passes the desired + * integer Hardware Clock time in sethwtime, and the corresponding + * system time (which may have a fractional part, and which may or may + * not be the same!) in refsystime. In an ideal situation, we would + * then apply sethwtime to the Hardware Clock at refsystime+500ms, so + * that when the Hardware Clock ticks forward to sethwtime+1s half a + * second later at refsystime+1000ms, everything is in sync. So we + * spin, waiting for gettimeofday() to return a time at or after that + * time (refsystime+500ms) up to a tolerance value, initially 1ms. If + * we miss that time due to being preempted for some other process, + * then we increase the margin a little bit (initially 1ms, doubling + * each time), add 1 second (or more, if needed to get a time that is + * in the future) to both the time for which we are waiting and the + * time that we will apply to the Hardware Clock, and start waiting + * again. + * + * For example, the caller requests that we set the Hardware Clock to + * 1:02:03, with reference time (current system time) = 6:07:08.250. + * We want the Hardware Clock to update to 1:02:04 at 6:07:09.250 on + * the system clock, and the first such update will occur 0.500 + * seconds after we write to the Hardware Clock, so we spin until the + * system clock reads 6:07:08.750. If we get there, great, but let's + * imagine the system is so heavily loaded that our process is + * preempted and by the time we get to run again, the system clock + * reads 6:07:11.990. We now want to wait until the next xx:xx:xx.750 + * time, which is 6:07:12.750 (4.5 seconds after the reference time), + * at which point we will set the Hardware Clock to 1:02:07 (4 seconds + * after the originally requested time). If we do that successfully, + * then at 6:07:13.250 (5 seconds after the reference time), the + * Hardware Clock will update to 1:02:08 (5 seconds after the + * originally requested time), and all is well thereafter. + */ + + time_t newhwtime = sethwtime; + double target_time_tolerance_secs = 0.001; /* initial value */ + double tolerance_incr_secs = 0.001; /* initial value */ + double delay; + struct timeval rtc_set_delay_tv; + + struct timeval targetsystime; + struct timeval nowsystime; + struct timeval prevsystime = refsystime; + double deltavstarget; + + if (ctl->rtc_delay != -1.0) /* --delay specified */ + delay = ctl->rtc_delay; + else + delay = get_hardware_delay(ctl); + + if (ctl->verbose) + printf(_("Using delay: %.6f seconds\n"), delay); + + rtc_set_delay_tv.tv_sec = 0; + rtc_set_delay_tv.tv_usec = delay * 1E6; + + timeradd(&refsystime, &rtc_set_delay_tv, &targetsystime); + + while (1) { + double ticksize; + + ON_DBG(RANDOM_SLEEP, up_to_1000ms_sleep()); + + gettimeofday(&nowsystime, NULL); + deltavstarget = time_diff(nowsystime, targetsystime); + ticksize = time_diff(nowsystime, prevsystime); + prevsystime = nowsystime; + + if (ticksize < 0) { + if (ctl->verbose) + printf(_("time jumped backward %.6f seconds " + "to %ld.%06ld - retargeting\n"), + ticksize, nowsystime.tv_sec, + nowsystime.tv_usec); + /* The retarget is handled at the end of the loop. */ + } else if (deltavstarget < 0) { + /* deltavstarget < 0 if current time < target time */ + DBG(DELTA_VS_TARGET, + ul_debug("%ld.%06ld < %ld.%06ld (%.6f)", + nowsystime.tv_sec, nowsystime.tv_usec, + targetsystime.tv_sec, + targetsystime.tv_usec, deltavstarget)); + continue; /* not there yet - keep spinning */ + } else if (deltavstarget <= target_time_tolerance_secs) { + /* Close enough to the target time; done waiting. */ + break; + } else /* (deltavstarget > target_time_tolerance_secs) */ { + /* + * We missed our window. Increase the tolerance and + * aim for the next opportunity. + */ + if (ctl->verbose) + printf(_("missed it - %ld.%06ld is too far " + "past %ld.%06ld (%.6f > %.6f)\n"), + nowsystime.tv_sec, + nowsystime.tv_usec, + targetsystime.tv_sec, + targetsystime.tv_usec, + deltavstarget, + target_time_tolerance_secs); + target_time_tolerance_secs += tolerance_incr_secs; + tolerance_incr_secs *= 2; + } + + /* + * Aim for the same offset (tv_usec) within the second in + * either the current second (if that offset hasn't arrived + * yet), or the next second. + */ + if (nowsystime.tv_usec < targetsystime.tv_usec) + targetsystime.tv_sec = nowsystime.tv_sec; + else + targetsystime.tv_sec = nowsystime.tv_sec + 1; + } + + newhwtime = sethwtime + + (int)(time_diff(nowsystime, refsystime) + - delay /* don't count this */ + + 0.5 /* for rounding */); + if (ctl->verbose) + printf(_("%ld.%06ld is close enough to %ld.%06ld (%.6f < %.6f)\n" + "Set RTC to %ld (%ld + %d; refsystime = %ld.%06ld)\n"), + nowsystime.tv_sec, nowsystime.tv_usec, + targetsystime.tv_sec, targetsystime.tv_usec, + deltavstarget, target_time_tolerance_secs, + newhwtime, sethwtime, + (int)(newhwtime - sethwtime), + refsystime.tv_sec, refsystime.tv_usec); + + set_hardware_clock(ctl, newhwtime); +} + +static int +display_time(struct timeval hwctime) +{ + char buf[ISO_BUFSIZ]; + + if (strtimeval_iso(&hwctime, ISO_TIMESTAMP_DOT, buf, sizeof(buf))) + return EXIT_FAILURE; + + printf("%s\n", buf); + return EXIT_SUCCESS; +} + +/* + * Adjusts System time, sets the kernel's timezone and RTC timescale. + * + * The kernel warp_clock function adjusts the System time according to the + * tz.tz_minuteswest argument and sets PCIL (see below). At boot settimeofday(2) + * has one-shot access to this function as shown in the table below. + * + * +-------------------------------------------------------------------+ + * | settimeofday(tv, tz) | + * |-------------------------------------------------------------------| + * | Arguments | System Time | PCIL | | warp_clock | + * | tv | tz | set | warped | set | firsttime | locked | + * |---------|---------|---------------|------|-----------|------------| + * | pointer | NULL | yes | no | no | 1 | no | + * | pointer | pointer | yes | no | no | 0 | yes | + * | NULL | ptr2utc | no | no | no | 0 | yes | + * | NULL | pointer | no | yes | yes | 0 | yes | + * +-------------------------------------------------------------------+ + * ptr2utc: tz.tz_minuteswest is zero (UTC). + * PCIL: persistent_clock_is_local, sets the "11 minute mode" timescale. + * firsttime: locks the warp_clock function (initialized to 1 at boot). + * + * +---------------------------------------------------------------------------+ + * | op | RTC scale | settimeofday calls | + * |---------|-----------|-----------------------------------------------------| + * | systz | Local | 1) warps system time*, sets PCIL* and kernel tz | + * | systz | UTC | 1st) locks warp_clock* 2nd) sets kernel tz | + * | hctosys | Local | 1st) sets PCIL* 2nd) sets system time and kernel tz | + * | hctosys | UTC | 1) sets system time and kernel tz | + * +---------------------------------------------------------------------------+ + * * only on first call after boot + */ +static int +set_system_clock(const struct hwclock_control *ctl, + const struct timeval newtime) +{ + struct tm broken; + int minuteswest; + int rc = 0; + const struct timezone tz_utc = { 0 }; + + localtime_r(&newtime.tv_sec, &broken); + minuteswest = -get_gmtoff(&broken) / 60; + + if (ctl->verbose) { + if (ctl->hctosys && !ctl->universal) + printf(_("Calling settimeofday(NULL, %d) to set " + "persistent_clock_is_local.\n"), minuteswest); + if (ctl->systz && ctl->universal) + puts(_("Calling settimeofday(NULL, 0) " + "to lock the warp function.")); + if (ctl->hctosys) + printf(_("Calling settimeofday(%ld.%06ld, %d)\n"), + newtime.tv_sec, newtime.tv_usec, minuteswest); + else { + printf(_("Calling settimeofday(NULL, %d) "), minuteswest); + if (ctl->universal) + puts(_("to set the kernel timezone.")); + else + puts(_("to warp System time.")); + } + } + + if (!ctl->testing) { + const struct timezone tz = { minuteswest }; + + if (ctl->hctosys && !ctl->universal) /* set PCIL */ + rc = settimeofday(NULL, &tz); + if (ctl->systz && ctl->universal) /* lock warp_clock */ + rc = settimeofday(NULL, &tz_utc); + if (!rc && ctl->hctosys) + rc = settimeofday(&newtime, &tz); + else if (!rc) + rc = settimeofday(NULL, &tz); + + if (rc) { + warn(_("settimeofday() failed")); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +/* + * Refresh the last calibrated and last adjusted timestamps in <*adjtime_p> + * to facilitate future drift calculations based on this set point. + * + * With the --update-drift option: + * Update the drift factor in <*adjtime_p> based on the fact that the + * Hardware Clock was just calibrated to <nowtime> and before that was + * set to the <hclocktime> time scale. + */ +static void +adjust_drift_factor(const struct hwclock_control *ctl, + struct adjtime *adjtime_p, + const struct timeval nowtime, + const struct timeval hclocktime) +{ + if (!ctl->update) { + if (ctl->verbose) + printf(_("Not adjusting drift factor because the " + "--update-drift option was not used.\n")); + } else if (adjtime_p->last_calib_time == 0) { + if (ctl->verbose) + printf(_("Not adjusting drift factor because last " + "calibration time is zero,\n" + "so history is bad and calibration startover " + "is necessary.\n")); + } else if ((hclocktime.tv_sec - adjtime_p->last_calib_time) < 4 * 60 * 60) { + if (ctl->verbose) + printf(_("Not adjusting drift factor because it has " + "been less than four hours since the last " + "calibration.\n")); + } else { + /* + * At adjustment time we drift correct the hardware clock + * according to the contents of the adjtime file and refresh + * its last adjusted timestamp. + * + * At calibration time we set the Hardware Clock and refresh + * both timestamps in <*adjtime_p>. + * + * Here, with the --update-drift option, we also update the + * drift factor in <*adjtime_p>. + * + * Let us do computation in doubles. (Floats almost suffice, + * but 195 days + 1 second equals 195 days in floats.) + */ + const double sec_per_day = 24.0 * 60.0 * 60.0; + double factor_adjust; + double drift_factor; + struct timeval last_calib; + + last_calib = t2tv(adjtime_p->last_calib_time); + /* + * Correction to apply to the current drift factor. + * + * Simplified: uncorrected_drift / days_since_calibration. + * + * hclocktime is fully corrected with the current drift factor. + * Its difference from nowtime is the missed drift correction. + */ + factor_adjust = time_diff(nowtime, hclocktime) / + (time_diff(nowtime, last_calib) / sec_per_day); + + drift_factor = adjtime_p->drift_factor + factor_adjust; + if (fabs(drift_factor) > MAX_DRIFT) { + if (ctl->verbose) + printf(_("Clock drift factor was calculated as " + "%f seconds/day.\n" + "It is far too much. Resetting to zero.\n"), + drift_factor); + drift_factor = 0; + } else { + if (ctl->verbose) + printf(_("Clock drifted %f seconds in the past " + "%f seconds\nin spite of a drift factor of " + "%f seconds/day.\n" + "Adjusting drift factor by %f seconds/day\n"), + time_diff(nowtime, hclocktime), + time_diff(nowtime, last_calib), + adjtime_p->drift_factor, factor_adjust); + } + + adjtime_p->drift_factor = drift_factor; + } + adjtime_p->last_calib_time = nowtime.tv_sec; + + adjtime_p->last_adj_time = nowtime.tv_sec; + + adjtime_p->not_adjusted = 0; + + adjtime_p->dirty = 1; +} + +/* + * Calculate the drift correction currently needed for the + * Hardware Clock based on the last time it was adjusted, + * and the current drift factor, as stored in the adjtime file. + * + * The total drift adjustment needed is stored at tdrift_p. + * + */ +static void +calculate_adjustment(const struct hwclock_control *ctl, + const double factor, + const time_t last_time, + const double not_adjusted, + const time_t systime, struct timeval *tdrift_p) +{ + double exact_adjustment; + + exact_adjustment = + ((double)(systime - last_time)) * factor / (24 * 60 * 60) + + not_adjusted; + tdrift_p->tv_sec = (time_t) floor(exact_adjustment); + tdrift_p->tv_usec = (exact_adjustment - + (double)tdrift_p->tv_sec) * 1E6; + if (ctl->verbose) { + printf(P_("Time since last adjustment is %ld second\n", + "Time since last adjustment is %ld seconds\n", + (systime - last_time)), + (systime - last_time)); + printf(_("Calculated Hardware Clock drift is %ld.%06ld seconds\n"), + tdrift_p->tv_sec, tdrift_p->tv_usec); + } +} + +/* + * Write the contents of the <adjtime> structure to its disk file. + * + * But if the contents are clean (unchanged since read from disk), don't + * bother. + */ +static int save_adjtime(const struct hwclock_control *ctl, + const struct adjtime *adjtime) +{ + char *content; /* Stuff to write to disk file */ + FILE *fp; + + xasprintf(&content, "%f %ld %f\n%ld\n%s\n", + adjtime->drift_factor, + adjtime->last_adj_time, + adjtime->not_adjusted, + adjtime->last_calib_time, + (adjtime->local_utc == LOCAL) ? "LOCAL" : "UTC"); + + if (ctl->verbose){ + printf(_("New %s data:\n%s"), + ctl->adj_file_name, content); + } + + if (!ctl->testing) { + fp = fopen(ctl->adj_file_name, "w"); + if (fp == NULL) { + warn(_("cannot open %s"), ctl->adj_file_name); + return EXIT_FAILURE; + } else if (fputs(content, fp) < 0 || close_stream(fp) != 0) { + warn(_("cannot update %s"), ctl->adj_file_name); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +/* + * Do the adjustment requested, by 1) setting the Hardware Clock (if + * necessary), and 2) updating the last-adjusted time in the adjtime + * structure. + * + * Do not update anything if the Hardware Clock does not currently present a + * valid time. + * + * <hclocktime> is the drift corrected time read from the Hardware Clock. + * + * <read_time> was the system time when the <hclocktime> was read, which due + * to computational delay could be a short time ago. It is used to define a + * trigger point for setting the Hardware Clock. The fractional part of the + * Hardware clock set time is subtracted from read_time to 'refer back', or + * delay, the trigger point. Fractional parts must be accounted for in this + * way, because the Hardware Clock can only be set to a whole second. + * + * <universal>: the Hardware Clock is kept in UTC. + * + * <testing>: We are running in test mode (no updating of clock). + * + */ +static void +do_adjustment(const struct hwclock_control *ctl, struct adjtime *adjtime_p, + const struct timeval hclocktime, + const struct timeval read_time) +{ + if (adjtime_p->last_adj_time == 0) { + if (ctl->verbose) + printf(_("Not setting clock because last adjustment time is zero, " + "so history is bad.\n")); + } else if (fabs(adjtime_p->drift_factor) > MAX_DRIFT) { + if (ctl->verbose) + printf(_("Not setting clock because drift factor %f is far too high.\n"), + adjtime_p->drift_factor); + } else { + set_hardware_clock_exact(ctl, hclocktime.tv_sec, + time_inc(read_time, + -(hclocktime.tv_usec / 1E6))); + adjtime_p->last_adj_time = hclocktime.tv_sec; + adjtime_p->not_adjusted = 0; + adjtime_p->dirty = 1; + } +} + +static void determine_clock_access_method(const struct hwclock_control *ctl) +{ + ur = NULL; + + if (ctl->directisa) + ur = probe_for_cmos_clock(); +#ifdef __linux__ + if (!ur) + ur = probe_for_rtc_clock(ctl); +#endif + if (ur) { + if (ctl->verbose) + puts(ur->interface_name); + + } else { + if (ctl->verbose) + printf(_("No usable clock interface found.\n")); + warnx(_("Cannot access the Hardware Clock via " + "any known method.")); + if (!ctl->verbose) + warnx(_("Use the --verbose option to see the " + "details of our search for an access " + "method.")); + hwclock_exit(ctl, EXIT_FAILURE); + } +} + +/* Do all the normal work of hwclock - read, set clock, etc. */ +static int +manipulate_clock(const struct hwclock_control *ctl, const time_t set_time, + const struct timeval startup_time, struct adjtime *adjtime) +{ + /* The time at which we read the Hardware Clock */ + struct timeval read_time; + /* + * The Hardware Clock gives us a valid time, or at + * least something close enough to fool mktime(). + */ + int hclock_valid = 0; + /* + * Tick synchronized time read from the Hardware Clock and + * then drift corrected for all operations except --show. + */ + struct timeval hclocktime = { 0 }; + /* + * hclocktime correlated to startup_time. That is, what drift + * corrected Hardware Clock time would have been at start up. + */ + struct timeval startup_hclocktime = { 0 }; + /* Total Hardware Clock drift correction needed. */ + struct timeval tdrift; + + if ((ctl->set || ctl->systohc || ctl->adjust) && + (adjtime->local_utc == UTC) != ctl->universal) { + adjtime->local_utc = ctl->universal ? UTC : LOCAL; + adjtime->dirty = 1; + } + /* + * Negate the drift correction, because we want to 'predict' a + * Hardware Clock time that includes drift. + */ + if (ctl->predict) { + hclocktime = t2tv(set_time); + calculate_adjustment(ctl, adjtime->drift_factor, + adjtime->last_adj_time, + adjtime->not_adjusted, + hclocktime.tv_sec, &tdrift); + hclocktime = time_inc(hclocktime, (double) + -(tdrift.tv_sec + tdrift.tv_usec / 1E6)); + if (ctl->verbose) { + printf(_ ("Target date: %ld\n"), set_time); + printf(_ ("Predicted RTC: %ld\n"), hclocktime.tv_sec); + } + return display_time(hclocktime); + } + + if (ctl->systz) + return set_system_clock(ctl, startup_time); + + if (ur->get_permissions()) + return EXIT_FAILURE; + + /* + * Read and drift correct RTC time; except for RTC set functions + * without the --update-drift option because: 1) it's not needed; + * 2) it enables setting a corrupted RTC without reading it first; + * 3) it significantly reduces system shutdown time. + */ + if ( ! ((ctl->set || ctl->systohc) && !ctl->update)) { + /* + * Timing critical - do not change the order of, or put + * anything between the follow three statements. + * Synchronization failure MUST exit, because all drift + * operations are invalid without it. + */ + if (synchronize_to_clock_tick(ctl)) + return EXIT_FAILURE; + read_hardware_clock(ctl, &hclock_valid, &hclocktime.tv_sec); + gettimeofday(&read_time, NULL); + + if (!hclock_valid) { + warnx(_("RTC read returned an invalid value.")); + return EXIT_FAILURE; + } + /* + * Calculate and apply drift correction to the Hardware Clock + * time for everything except --show + */ + calculate_adjustment(ctl, adjtime->drift_factor, + adjtime->last_adj_time, + adjtime->not_adjusted, + hclocktime.tv_sec, &tdrift); + if (!ctl->show) + hclocktime = time_inc(tdrift, hclocktime.tv_sec); + + startup_hclocktime = + time_inc(hclocktime, time_diff(startup_time, read_time)); + } + if (ctl->show || ctl->get) { + return display_time(startup_hclocktime); + } else if (ctl->set) { + set_hardware_clock_exact(ctl, set_time, startup_time); + if (!ctl->noadjfile) + adjust_drift_factor(ctl, adjtime, t2tv(set_time), + startup_hclocktime); + } else if (ctl->adjust) { + if (tdrift.tv_sec > 0 || tdrift.tv_sec < -1) + do_adjustment(ctl, adjtime, hclocktime, read_time); + else + printf(_("Needed adjustment is less than one second, " + "so not setting clock.\n")); + } else if (ctl->systohc) { + struct timeval nowtime, reftime; + /* + * We can only set_hardware_clock_exact to a + * whole seconds time, so we set it with + * reference to the most recent whole + * seconds time. + */ + gettimeofday(&nowtime, NULL); + reftime.tv_sec = nowtime.tv_sec; + reftime.tv_usec = 0; + set_hardware_clock_exact(ctl, (time_t) reftime.tv_sec, reftime); + if (!ctl->noadjfile) + adjust_drift_factor(ctl, adjtime, nowtime, + hclocktime); + } else if (ctl->hctosys) { + return set_system_clock(ctl, hclocktime); + } + if (!ctl->noadjfile && adjtime->dirty) + return save_adjtime(ctl, adjtime); + return EXIT_SUCCESS; +} + +/** + * Get or set the kernel RTC driver's epoch on Alpha machines. + * ISA machines are hard coded for 1900. + */ +#if defined(__linux__) && defined(__alpha__) +static void +manipulate_epoch(const struct hwclock_control *ctl) +{ + if (ctl->getepoch) { + unsigned long epoch; + + if (get_epoch_rtc(ctl, &epoch)) + warnx(_("unable to read the RTC epoch.")); + else + printf(_("The RTC epoch is set to %lu.\n"), epoch); + } else if (ctl->setepoch) { + if (!ctl->epoch_option) + warnx(_("--epoch is required for --setepoch.")); + else if (!ctl->testing) + if (set_epoch_rtc(ctl)) + warnx(_("unable to set the RTC epoch.")); + } +} +#endif /* __linux__ __alpha__ */ + +static void out_version(void) +{ + printf(UTIL_LINUX_VERSION); +} + +static void __attribute__((__noreturn__)) +usage(void) +{ + fputs(USAGE_HEADER, stdout); + printf(_(" %s [function] [option...]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, stdout); + puts(_("Time clocks utility.")); + + fputs(USAGE_FUNCTIONS, stdout); + puts(_(" -r, --show display the RTC time")); + puts(_(" --get display drift corrected RTC time")); + puts(_(" --set set the RTC according to --date")); + puts(_(" -s, --hctosys set the system time from the RTC")); + puts(_(" -w, --systohc set the RTC from the system time")); + puts(_(" --systz send timescale configurations to the kernel")); + puts(_(" -a, --adjust adjust the RTC to account for systematic drift")); +#if defined(__linux__) && defined(__alpha__) + puts(_(" --getepoch display the RTC epoch")); + puts(_(" --setepoch set the RTC epoch according to --epoch")); +#endif + puts(_(" --predict predict the drifted RTC time according to --date")); + fputs(USAGE_OPTIONS, stdout); + puts(_(" -u, --utc the RTC timescale is UTC")); + puts(_(" -l, --localtime the RTC timescale is Local")); +#ifdef __linux__ + printf(_( + " -f, --rtc <file> use an alternate file to %1$s\n"), _PATH_RTC_DEV); +#endif + printf(_( + " --directisa use the ISA bus instead of %1$s access\n"), _PATH_RTC_DEV); + puts(_(" --date <time> date/time input for --set and --predict")); + puts(_(" --delay <sec> delay used when set new RTC time")); +#if defined(__linux__) && defined(__alpha__) + puts(_(" --epoch <year> epoch input for --setepoch")); +#endif + puts(_(" --update-drift update the RTC drift factor")); + printf(_( + " --noadjfile do not use %1$s\n"), _PATH_ADJTIME); + printf(_( + " --adjfile <file> use an alternate file to %1$s\n"), _PATH_ADJTIME); + puts(_(" --test dry run; implies --verbose")); + puts(_(" -v, --verbose display more details")); + fputs(USAGE_SEPARATOR, stdout); + printf(USAGE_HELP_OPTIONS(22)); + printf(USAGE_MAN_TAIL("hwclock(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + struct hwclock_control ctl = { + .show = 1, /* default op is show */ + .rtc_delay = -1.0 /* unspecified */ + }; + struct timeval startup_time; + struct adjtime adjtime = { 0 }; + struct timespec when = { 0 }; + /* + * The time we started up, in seconds into the epoch, including + * fractions. + */ + time_t set_time = 0; /* Time to which user said to set Hardware Clock */ + int rc, c; + + /* Long only options. */ + enum { + OPT_ADJFILE = CHAR_MAX + 1, + OPT_DATE, + OPT_DELAY, + OPT_DIRECTISA, + OPT_EPOCH, + OPT_GET, + OPT_GETEPOCH, + OPT_NOADJFILE, + OPT_PREDICT, + OPT_SET, + OPT_SETEPOCH, + OPT_SYSTZ, + OPT_TEST, + OPT_UPDATE + }; + + static const struct option longopts[] = { + { "adjust", no_argument, NULL, 'a' }, + { "help", no_argument, NULL, 'h' }, + { "localtime", no_argument, NULL, 'l' }, + { "show", no_argument, NULL, 'r' }, + { "hctosys", no_argument, NULL, 's' }, + { "utc", no_argument, NULL, 'u' }, + { "version", no_argument, NULL, 'V' }, + { "systohc", no_argument, NULL, 'w' }, + { "debug", no_argument, NULL, 'D' }, + { "ul-debug", required_argument, NULL, 'd' }, + { "verbose", no_argument, NULL, 'v' }, + { "set", no_argument, NULL, OPT_SET }, +#if defined(__linux__) && defined(__alpha__) + { "getepoch", no_argument, NULL, OPT_GETEPOCH }, + { "setepoch", no_argument, NULL, OPT_SETEPOCH }, + { "epoch", required_argument, NULL, OPT_EPOCH }, +#endif + { "noadjfile", no_argument, NULL, OPT_NOADJFILE }, + { "directisa", no_argument, NULL, OPT_DIRECTISA }, + { "test", no_argument, NULL, OPT_TEST }, + { "date", required_argument, NULL, OPT_DATE }, + { "delay", required_argument, NULL, OPT_DELAY }, +#ifdef __linux__ + { "rtc", required_argument, NULL, 'f' }, +#endif + { "adjfile", required_argument, NULL, OPT_ADJFILE }, + { "systz", no_argument, NULL, OPT_SYSTZ }, + { "predict", no_argument, NULL, OPT_PREDICT }, + { "get", no_argument, NULL, OPT_GET }, + { "update-drift", no_argument, NULL, OPT_UPDATE }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'a','r','s','w', + OPT_GET, OPT_GETEPOCH, OPT_PREDICT, + OPT_SET, OPT_SETEPOCH, OPT_SYSTZ }, + { 'l', 'u' }, + { OPT_ADJFILE, OPT_NOADJFILE }, + { OPT_NOADJFILE, OPT_UPDATE }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + /* Remember what time we were invoked */ + gettimeofday(&startup_time, NULL); + +#ifdef HAVE_LIBAUDIT + hwaudit_fd = audit_open(); + if (hwaudit_fd < 0 && !(errno == EINVAL || errno == EPROTONOSUPPORT || + errno == EAFNOSUPPORT)) { + /* + * You get these error codes only when the kernel doesn't + * have audit compiled in. + */ + warnx(_("Unable to connect to audit system")); + return EXIT_FAILURE; + } +#endif + setlocale(LC_ALL, ""); +#ifdef LC_NUMERIC + /* + * We need LC_CTYPE and LC_TIME and LC_MESSAGES, but must avoid + * LC_NUMERIC since it gives problems when we write to /etc/adjtime. + * - gqueri@mail.dotcom.fr + */ + setlocale(LC_NUMERIC, "C"); +#endif + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, + "hvVDd:alrsuwf:", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'D': + warnx(_("use --verbose, --debug has been deprecated.")); + break; + case 'v': + ctl.verbose = 1; + break; + case 'd': + hwclock_init_debug(optarg); + break; + case 'a': + ctl.adjust = 1; + ctl.show = 0; + ctl.hwaudit_on = 1; + break; + case 'l': + ctl.local_opt = 1; /* --localtime */ + break; + case 'r': + ctl.show = 1; + break; + case 's': + ctl.hctosys = 1; + ctl.show = 0; + ctl.hwaudit_on = 1; + break; + case 'u': + ctl.utc = 1; + break; + case 'w': + ctl.systohc = 1; + ctl.show = 0; + ctl.hwaudit_on = 1; + break; + case OPT_SET: + ctl.set = 1; + ctl.show = 0; + ctl.hwaudit_on = 1; + break; +#if defined(__linux__) && defined(__alpha__) + case OPT_GETEPOCH: + ctl.getepoch = 1; + ctl.show = 0; + break; + case OPT_SETEPOCH: + ctl.setepoch = 1; + ctl.show = 0; + ctl.hwaudit_on = 1; + break; + case OPT_EPOCH: + ctl.epoch_option = optarg; /* --epoch */ + break; +#endif + case OPT_NOADJFILE: + ctl.noadjfile = 1; + break; + case OPT_DIRECTISA: + ctl.directisa = 1; + break; + case OPT_TEST: + ctl.testing = 1; /* --test */ + ctl.verbose = 1; + break; + case OPT_DATE: + ctl.date_opt = optarg; /* --date */ + break; + case OPT_DELAY: + ctl.rtc_delay = strtod_or_err(optarg, "invalid --delay argument"); + break; + case OPT_ADJFILE: + ctl.adj_file_name = optarg; /* --adjfile */ + break; + case OPT_SYSTZ: + ctl.systz = 1; /* --systz */ + ctl.show = 0; + ctl.hwaudit_on = 1; + break; + case OPT_PREDICT: + ctl.predict = 1; /* --predict */ + ctl.show = 0; + break; + case OPT_GET: + ctl.get = 1; /* --get */ + ctl.show = 0; + break; + case OPT_UPDATE: + ctl.update = 1; /* --update-drift */ + break; +#ifdef __linux__ + case 'f': + ctl.rtc_dev_name = optarg; /* --rtc */ + break; +#endif + case 'V': /* --version */ + out_version(); + return 0; + case 'h': /* --help */ + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (argc -= optind) { + warnx(_("%d too many arguments given"), argc); + errtryhelp(EXIT_FAILURE); + } + + if (!ctl.adj_file_name) + ctl.adj_file_name = _PATH_ADJTIME; + + if (ctl.update && !ctl.set && !ctl.systohc) { + warnx(_("--update-drift requires --set or --systohc")); + exit(EXIT_FAILURE); + } + + if (ctl.noadjfile && !ctl.utc && !ctl.local_opt) { + warnx(_("With --noadjfile, you must specify " + "either --utc or --localtime")); + exit(EXIT_FAILURE); + } + + if (ctl.set || ctl.predict) { + if (!ctl.date_opt) { + warnx(_("--date is required for --set or --predict")); + exit(EXIT_FAILURE); + } + if (parse_date(&when, ctl.date_opt, NULL)) + set_time = when.tv_sec; + else { + warnx(_("invalid date '%s'"), ctl.date_opt); + exit(EXIT_FAILURE); + } + } + +#if defined(__linux__) && defined(__alpha__) + if (ctl.getepoch || ctl.setepoch) { + manipulate_epoch(&ctl); + hwclock_exit(&ctl, EXIT_SUCCESS); + } +#endif + + if (ctl.verbose) { + out_version(); + printf(_("System Time: %ld.%06ld\n"), + startup_time.tv_sec, startup_time.tv_usec); + } + + if (!ctl.systz && !ctl.predict) + determine_clock_access_method(&ctl); + + if (!ctl.noadjfile && !(ctl.systz && (ctl.utc || ctl.local_opt))) { + if ((rc = read_adjtime(&ctl, &adjtime)) != 0) + hwclock_exit(&ctl, rc); + } else + /* Avoid writing adjtime file if we don't have to. */ + adjtime.dirty = 0; + ctl.universal = hw_clock_is_utc(&ctl, adjtime); + rc = manipulate_clock(&ctl, set_time, startup_time, &adjtime); + if (ctl.testing) + puts(_("Test mode: nothing was changed.")); + hwclock_exit(&ctl, rc); + return rc; /* Not reached */ +} + +void +hwclock_exit(const struct hwclock_control *ctl +#ifndef HAVE_LIBAUDIT + __attribute__((__unused__)) +#endif + , int status) +{ +#ifdef HAVE_LIBAUDIT + if (ctl->hwaudit_on && !ctl->testing) { + audit_log_user_message(hwaudit_fd, AUDIT_USYS_CONFIG, + "op=change-system-time", NULL, NULL, NULL, + status); + } + close(hwaudit_fd); +#endif + exit(status); +} + +/* + * History of this program: + * + * 98.08.12 BJH Version 2.4 + * + * Don't use century byte from Hardware Clock. Add comments telling why. + * + * 98.06.20 BJH Version 2.3. + * + * Make --hctosys set the kernel timezone from TZ environment variable + * and/or /usr/lib/zoneinfo. From Klaus Ripke (klaus@ripke.com). + * + * 98.03.05 BJH. Version 2.2. + * + * Add --getepoch and --setepoch. + * + * Fix some word length things so it works on Alpha. + * + * Make it work when /dev/rtc doesn't have the interrupt functions. In this + * case, busywait for the top of a second instead of blocking and waiting + * for the update complete interrupt. + * + * Fix a bunch of bugs too numerous to mention. + * + * 97.06.01: BJH. Version 2.1. Read and write the century byte (Byte 50) of + * the ISA Hardware Clock when using direct ISA I/O. Problem discovered by + * job (jei@iclnl.icl.nl). + * + * Use the rtc clock access method in preference to the KDGHWCLK method. + * Problem discovered by Andreas Schwab <schwab@LS5.informatik.uni-dortmund.de>. + * + * November 1996: Version 2.0.1. Modifications by Nicolai Langfeldt + * (janl@math.uio.no) to make it compile on linux 1.2 machines as well as + * more recent versions of the kernel. Introduced the NO_CLOCK access method + * and wrote feature test code to detect absence of rtc headers. + * + *************************************************************************** + * Maintenance notes + * + * To compile this, you must use GNU compiler optimization (-O option) in + * order to make the "extern inline" functions from asm/io.h (inb(), etc.) + * compile. If you don't optimize, which means the compiler will generate no + * inline functions, the references to these functions in this program will + * be compiled as external references. Since you probably won't be linking + * with any functions by these names, you will have unresolved external + * references when you link. + * + * Here's some info on how we must deal with the time that elapses while + * this program runs: There are two major delays as we run: + * + * 1) Waiting up to 1 second for a transition of the Hardware Clock so + * we are synchronized to the Hardware Clock. + * 2) Running the "date" program to interpret the value of our --date + * option. + * + * Reading the /etc/adjtime file is the next biggest source of delay and + * uncertainty. + * + * The user wants to know what time it was at the moment he invoked us, not + * some arbitrary time later. And in setting the clock, he is giving us the + * time at the moment we are invoked, so if we set the clock some time + * later, we have to add some time to that. + * + * So we check the system time as soon as we start up, then run "date" and + * do file I/O if necessary, then wait to synchronize with a Hardware Clock + * edge, then check the system time again to see how much time we spent. We + * immediately read the clock then and (if appropriate) report that time, + * and additionally, the delay we measured. + * + * If we're setting the clock to a time given by the user, we wait some more + * so that the total delay is an integral number of seconds, then set the + * Hardware Clock to the time the user requested plus that integral number + * of seconds. N.B. The Hardware Clock can only be set in integral seconds. + * + * If we're setting the clock to the system clock value, we wait for the + * system clock to reach the top of a second, and then set the Hardware + * Clock to the system clock's value. + * + * Here's an interesting point about setting the Hardware Clock: On my + * machine, when you set it, it sets to that precise time. But one can + * imagine another clock whose update oscillator marches on a steady one + * second period, so updating the clock between any two oscillator ticks is + * the same as updating it right at the earlier tick. To avoid any + * complications that might cause, we set the clock as soon as possible + * after an oscillator tick. + * + * About synchronizing to the Hardware Clock when reading the time: The + * precision of the Hardware Clock counters themselves is one second. You + * can't read the counters and find out that is 12:01:02.5. But if you + * consider the location in time of the counter's ticks as part of its + * value, then its precision is as infinite as time is continuous! What I'm + * saying is this: To find out the _exact_ time in the hardware clock, we + * wait until the next clock tick (the next time the second counter changes) + * and measure how long we had to wait. We then read the value of the clock + * counters and subtract the wait time and we know precisely what time it + * was when we set out to query the time. + * + * hwclock uses this method, and considers the Hardware Clock to have + * infinite precision. + */ diff --git a/sys-utils/hwclock.h b/sys-utils/hwclock.h new file mode 100644 index 0000000..92fdb5f --- /dev/null +++ b/sys-utils/hwclock.h @@ -0,0 +1,80 @@ +#ifndef HWCLOCK_CLOCK_H +#define HWCLOCK_CLOCK_H + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <time.h> + +#include "c.h" +#include "debug.h" + +#define HWCLOCK_DEBUG_INIT (1 << 0) +#define HWCLOCK_DEBUG_RANDOM_SLEEP (1 << 1) +#define HWCLOCK_DEBUG_DELTA_VS_TARGET (1 << 2) +#define HWCLOCK_DEBUG_ALL 0xFFFF + +UL_DEBUG_DECLARE_MASK(hwclock); +#define DBG(m, x) __UL_DBG(hwclock, HWCLOCK_DEBUG_, m, x) +#define ON_DBG(m, x) __UL_DBG_CALL(hwclock, HWCLOCK_DEBUG_, m, x) + +struct hwclock_control { + char *date_opt; + char *adj_file_name; + double rtc_delay; /* --delay <seconds> */ +#if defined(__linux__) && defined(__alpha__) + char *epoch_option; +#endif +#ifdef __linux__ + char *rtc_dev_name; +#endif + unsigned int + hwaudit_on:1, + adjust:1, + show:1, + hctosys:1, + utc:1, + systohc:1, +#if defined(__linux__) && defined(__alpha__) + getepoch:1, + setepoch:1, +#endif + noadjfile:1, + local_opt:1, + directisa:1, + testing:1, + systz:1, + predict:1, + get:1, + set:1, + update:1, + universal:1, /* will store hw_clock_is_utc() return value */ + verbose:1; +}; + +struct clock_ops { + char *interface_name; + int (*get_permissions) (void); + int (*read_hardware_clock) (const struct hwclock_control *ctl, struct tm * tm); + int (*set_hardware_clock) (const struct hwclock_control *ctl, const struct tm * tm); + int (*synchronize_to_clock_tick) (const struct hwclock_control *ctl); + const char *(*get_device_path) (void); +}; + +extern struct clock_ops *probe_for_cmos_clock(void); +extern struct clock_ops *probe_for_rtc_clock(const struct hwclock_control *ctl); + +/* hwclock.c */ +extern double time_diff(struct timeval subtrahend, struct timeval subtractor); + +/* rtc.c */ +#if defined(__linux__) && defined(__alpha__) +extern int get_epoch_rtc(const struct hwclock_control *ctl, unsigned long *epoch); +extern int set_epoch_rtc(const struct hwclock_control *ctl); +#endif + +extern void __attribute__((__noreturn__)) +hwclock_exit(const struct hwclock_control *ctl, int status); + +#endif /* HWCLOCK_CLOCK_H */ diff --git a/sys-utils/ipcmk.1 b/sys-utils/ipcmk.1 new file mode 100644 index 0000000..e6ed434 --- /dev/null +++ b/sys-utils/ipcmk.1 @@ -0,0 +1,54 @@ +.\" Copyright 2008 Hayden A. James (hayden.james@gmail.com) +.\" May be distributed under the GNU General Public License +.TH IPCMK "1" "July 2014" "util-linux" "User Commands" +.SH "NAME" +ipcmk \- make various IPC resources +.SH "SYNOPSIS" +.B ipcmk +[options] +.SH "DESCRIPTION" +.B ipcmk +allows you to create shared memory segments, message queues, +and semaphore arrays. +.SH "OPTIONS" +.TP +Resources can be specified with these options: +.TP +.BR \-M , " \-\-shmem " \fIsize +Create a shared memory segment of +.I size +bytes. +The \fIsize\fR argument may be followed by the multiplicative suffixes KiB (=1024), MiB (=1024*1024), and so on for GiB, etc. (the +"iB" is optional, e.g., "K" has the same meaning as "KiB") or the suffixes KB (=1000), MB (=1000*1000), and so on for GB, etc. +.TP +.BR \-Q , " \-\-queue" +Create a message queue. +.TP +.BR \-S , " \-\-semaphore " \fInumber +Create a semaphore array with +.I number +of elements. +.PP +Other options are: +.TP +.BR \-p , " \-\-mode " \fImode +Access permissions for the resource. Default is 0644. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.PP +.SH "SEE ALSO" +.BR ipcrm (1), +.BR ipcs (1) +.SH "AUTHOR" +.MT hayden.james@gmail.com +Hayden A. James +.ME +.SH "AVAILABILITY" +The ipcmk command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/ipcmk.c b/sys-utils/ipcmk.c new file mode 100644 index 0000000..df83652 --- /dev/null +++ b/sys-utils/ipcmk.c @@ -0,0 +1,163 @@ +/* + * ipcmk.c - used to create ad-hoc IPC segments + * + * Copyright (C) 2008 Hayden A. James (hayden.james@gmail.com) + * Copyright (C) 2008 Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <errno.h> +#include <getopt.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/ipc.h> +#include <sys/msg.h> +#include <sys/sem.h> +#include <sys/shm.h> +#include <sys/time.h> + +#include "c.h" +#include "nls.h" +#include "randutils.h" +#include "strutils.h" +#include "closestream.h" + +static int create_shm(size_t size, int permission) +{ + key_t key; + + random_get_bytes(&key, sizeof(key)); + return shmget(key, size, permission | IPC_CREAT); +} + +static int create_msg(int permission) +{ + key_t key; + + random_get_bytes(&key, sizeof(key)); + return msgget(key, permission | IPC_CREAT); +} + +static int create_sem(int nsems, int permission) +{ + key_t key; + + random_get_bytes(&key, sizeof(key)); + return semget(key, nsems, permission | IPC_CREAT); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Create various IPC resources.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -M, --shmem <size> create shared memory segment of size <size>\n"), out); + fputs(_(" -S, --semaphore <number> create semaphore array with <number> elements\n"), out); + fputs(_(" -Q, --queue create message queue\n"), out); + fputs(_(" -p, --mode <mode> permission for the resource (default is 0644)\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(26)); + printf(USAGE_MAN_TAIL("ipcmk(1)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + int permission = 0644; + int opt; + size_t size = 0; + int nsems = 0; + int ask_shm = 0, ask_msg = 0, ask_sem = 0; + static const struct option longopts[] = { + {"shmem", required_argument, NULL, 'M'}, + {"semaphore", required_argument, NULL, 'S'}, + {"queue", no_argument, NULL, 'Q'}, + {"mode", required_argument, NULL, 'p'}, + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while((opt = getopt_long(argc, argv, "hM:QS:p:Vh", longopts, NULL)) != -1) { + switch(opt) { + case 'M': + size = strtosize_or_err(optarg, _("failed to parse size")); + ask_shm = 1; + break; + case 'Q': + ask_msg = 1; + break; + case 'S': + nsems = strtos32_or_err(optarg, _("failed to parse elements")); + ask_sem = 1; + break; + case 'p': + permission = strtoul(optarg, NULL, 8); + break; + case 'h': + usage(); + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if(!ask_shm && !ask_msg && !ask_sem) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + if (ask_shm) { + int shmid; + if (-1 == (shmid = create_shm(size, permission))) + err(EXIT_FAILURE, _("create share memory failed")); + else + printf(_("Shared memory id: %d\n"), shmid); + } + + if (ask_msg) { + int msgid; + if (-1 == (msgid = create_msg(permission))) + err(EXIT_FAILURE, _("create message queue failed")); + else + printf(_("Message queue id: %d\n"), msgid); + } + + if (ask_sem) { + int semid; + if (-1 == (semid = create_sem(nsems, permission))) + err(EXIT_FAILURE, _("create semaphore failed")); + else + printf(_("Semaphore id: %d\n"), semid); + } + + return EXIT_SUCCESS; +} diff --git a/sys-utils/ipcrm.1 b/sys-utils/ipcrm.1 new file mode 100644 index 0000000..be73dff --- /dev/null +++ b/sys-utils/ipcrm.1 @@ -0,0 +1,117 @@ +.\" Copyright 2002 Andre C. Mazzone (linuxdev@karagee.com) +.\" May be distributed under the GNU General Public License +.TH IPCRM "1" "July 2014" "util-linux" "User Commands" +.SH NAME +ipcrm \- remove certain IPC resources +.SH SYNOPSIS +.B ipcrm +[options] +.sp +.B ipcrm +.RB { shm | msg | sem } +.IR id ... +.SH DESCRIPTION +.B ipcrm +removes System V inter-process communication (IPC) objects +and associated data structures from the system. +In order to delete such objects, you must be superuser, or +the creator or owner of the object. +.PP +System V IPC objects are of three types: shared memory, +message queues, and semaphores. +Deletion of a message queue or semaphore object is immediate +(regardless of whether any process still holds an IPC +identifier for the object). +A shared memory object is only removed +after all currently attached processes have detached +.RB ( shmdt (2)) +the object from their virtual address space. +.PP +Two syntax styles are supported. The old Linux historical syntax specifies +a three-letter keyword indicating which class of object is to be deleted, +followed by one or more IPC identifiers for objects of this type. +.PP +The SUS-compliant syntax allows the specification of +zero or more objects of all three types in a single command line, +with objects specified either by key or by identifier (see below). +Both keys and identifiers may be specified in decimal, hexadecimal +(specified with an initial '0x' or '0X'), or octal (specified with +an initial '0'). +.PP +The details of the removes are described in +.BR shmctl (2), +.BR msgctl (2), +and +.BR semctl (2). +The identifiers and keys can be found by using +.BR ipcs (1). +.SH OPTIONS +.TP +\fB-a\fR, \fB\-\-all\fR [\fBshm\fR] [\fBmsg\fR] [\fBsem\fR] +Remove all resources. When an option argument is provided, the removal is +performed only for the specified resource types. \fIWarning!\fR Do not use +.B \-a +if you are unsure how the software using the resources might react to missing +objects. Some programs create these resources at startup and may not have +any code to deal with an unexpected disappearance. +.TP +.BR \-M , " \-\-shmem\-key " \fIshmkey +Remove the shared memory segment created with +.I shmkey +after the last detach is performed. +.TP +.BR \-m , " \-\-shmem\-id " \fIshmid +Remove the shared memory segment identified by +.I shmid +after the last detach is performed. +.TP +.BR \-Q , " \-\-queue\-key " \fImsgkey +Remove the message queue created with +.IR msgkey . +.TP +.BR \-q , " \-\-queue\-id " \fImsgid +Remove the message queue identified by +.IR msgid . +.TP +.BR \-S , " \-\-semaphore\-key " \fIsemkey +Remove the semaphore created with +.IR semkey . +.TP +.BR \-s , " \-\-semaphore\-id " \fIsemid +Remove the semaphore identified by +.IR semid . +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH NOTES +In its first Linux implementation, \fBipcrm\fR used the deprecated syntax +shown in the second line of the +.BR SYNOPSIS . +Functionality present in other *nix implementations of \fBipcrm\fR has since +been added, namely the ability to delete resources by key (not just +identifier), and to respect the same command-line syntax. For backward +compatibility the previous syntax is still supported. +.\" .SH AUTHORS +.\" Andre C. Mazzone (linuxdev@karagee.com) +.\" .br +.\" Krishna Balasubramanian (balasub@cis.ohio-state.edu) +.SH SEE ALSO +.nh +.BR ipcmk (1), +.BR ipcs (1), +.BR msgctl (2), +.BR msgget (2), +.BR semctl (2), +.BR semget (2), +.BR shmctl (2), +.BR shmdt (2), +.BR shmget (2), +.BR ftok (3) +.SH AVAILABILITY +The ipcrm command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/ipcrm.c b/sys-utils/ipcrm.c new file mode 100644 index 0000000..a9f2d1b --- /dev/null +++ b/sys-utils/ipcrm.c @@ -0,0 +1,423 @@ +/* + * krishna balasubramanian 1993 + * + * 1999-02-22 Arkadiusz MiÅ›kiewicz <misiek@pld.ORG.PL> + * - added Native Language Support + * + * 1999-04-02 frank zago + * - can now remove several id's in the same call + * + */ + +#include <errno.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ipc.h> +#include <sys/msg.h> +#include <sys/sem.h> +#include <sys/shm.h> +#include <sys/types.h> +#include "c.h" +#include "nls.h" +#include "strutils.h" +#include "closestream.h" + +#ifndef HAVE_UNION_SEMUN +/* according to X/OPEN we have to define it ourselves */ +union semun { + int val; + struct semid_ds *buf; + unsigned short int *array; + struct seminfo *__buf; +}; +#endif + +typedef enum type_id { + SHM, + SEM, + MSG, + ALL +} type_id; + +static int verbose = 0; + +/* print the usage */ +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %1$s [options]\n" + " %1$s shm|msg|sem <id>...\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Remove certain IPC resources.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -m, --shmem-id <id> remove shared memory segment by id\n"), out); + fputs(_(" -M, --shmem-key <key> remove shared memory segment by key\n"), out); + fputs(_(" -q, --queue-id <id> remove message queue by id\n"), out); + fputs(_(" -Q, --queue-key <key> remove message queue by key\n"), out); + fputs(_(" -s, --semaphore-id <id> remove semaphore by id\n"), out); + fputs(_(" -S, --semaphore-key <key> remove semaphore by key\n"), out); + fputs(_(" -a, --all[=shm|msg|sem] remove all (in the specified category)\n"), out); + fputs(_(" -v, --verbose explain what is being done\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(28)); + printf(USAGE_MAN_TAIL("ipcrm(1)")); + + exit(EXIT_SUCCESS); +} + +static int remove_id(int type, int iskey, int id) +{ + int ret; + char *errmsg; + /* needed to delete semaphores */ + union semun arg; + arg.val = 0; + + /* do the removal */ + switch (type) { + case SHM: + if (verbose) + printf(_("removing shared memory segment id `%d'\n"), id); + ret = shmctl(id, IPC_RMID, NULL); + break; + case MSG: + if (verbose) + printf(_("removing message queue id `%d'\n"), id); + ret = msgctl(id, IPC_RMID, NULL); + break; + case SEM: + if (verbose) + printf(_("removing semaphore id `%d'\n"), id); + ret = semctl(id, 0, IPC_RMID, arg); + break; + default: + errx(EXIT_FAILURE, "impossible occurred"); + } + + /* how did the removal go? */ + if (ret < 0) { + switch (errno) { + case EACCES: + case EPERM: + errmsg = iskey ? _("permission denied for key") : _("permission denied for id"); + break; + case EINVAL: + errmsg = iskey ? _("invalid key") : _("invalid id"); + break; + case EIDRM: + errmsg = iskey ? _("already removed key") : _("already removed id"); + break; + default: + err(EXIT_FAILURE, "%s", iskey ? _("key failed") : _("id failed")); + } + warnx("%s (%d)", errmsg, id); + return 1; + } + return 0; +} + +static int remove_arg_list(type_id type, int argc, char **argv) +{ + int id; + char *end; + int nb_errors = 0; + + do { + id = strtoul(argv[0], &end, 10); + if (*end != 0) { + warnx(_("invalid id: %s"), argv[0]); + nb_errors++; + } else { + if (remove_id(type, 0, id)) + nb_errors++; + } + argc--; + argv++; + } while (argc); + return (nb_errors); +} + +static int deprecated_main(int argc, char **argv) +{ + type_id type; + + if (!strcmp(argv[1], "shm")) + type = SHM; + else if (!strcmp(argv[1], "msg")) + type = MSG; + else if (!strcmp(argv[1], "sem")) + type = SEM; + else + return 0; + + if (argc < 3) { + warnx(_("not enough arguments")); + errtryhelp(EXIT_FAILURE); + } + + if (remove_arg_list(type, argc - 2, &argv[2])) + exit(EXIT_FAILURE); + + printf(_("resource(s) deleted\n")); + return 1; +} + +static unsigned long strtokey(const char *str, const char *errmesg) +{ + unsigned long num; + char *end = NULL; + + if (str == NULL || *str == '\0') + goto err; + errno = 0; + /* keys are in hex or decimal */ + num = strtoul(str, &end, 0); + + if (errno || str == end || (end && *end)) + goto err; + + return num; + err: + if (errno) + err(EXIT_FAILURE, "%s: '%s'", errmesg, str); + else + errx(EXIT_FAILURE, "%s: '%s'", errmesg, str); + return 0; +} + +static int key_to_id(type_id type, char *s) +{ + int id; + /* keys are in hex or decimal */ + key_t key = strtokey(s, "failed to parse argument"); + if (key == IPC_PRIVATE) { + warnx(_("illegal key (%s)"), s); + return -1; + } + switch (type) { + case SHM: + id = shmget(key, 0, 0); + break; + case MSG: + id = msgget(key, 0); + break; + case SEM: + id = semget(key, 0, 0); + break; + case ALL: + abort(); + default: + errx(EXIT_FAILURE, "impossible occurred"); + } + if (id < 0) { + char *errmsg; + switch (errno) { + case EACCES: + errmsg = _("permission denied for key"); + break; + case EIDRM: + errmsg = _("already removed key"); + break; + case ENOENT: + errmsg = _("invalid key"); + break; + default: + err(EXIT_FAILURE, _("key failed")); + } + warnx("%s (%s)", errmsg, s); + } + return id; +} + +static int remove_all(type_id type) +{ + int ret = 0; + int id, rm_me, maxid; + + struct shmid_ds shmseg; + + struct semid_ds semary; + struct seminfo seminfo; + union semun arg; + + struct msqid_ds msgque; + struct msginfo msginfo; + + if (type == SHM || type == ALL) { + maxid = shmctl(0, SHM_INFO, &shmseg); + if (maxid < 0) + errx(EXIT_FAILURE, + _("kernel not configured for shared memory")); + for (id = 0; id <= maxid; id++) { + rm_me = shmctl(id, SHM_STAT, &shmseg); + if (rm_me < 0) + continue; + ret |= remove_id(SHM, 0, rm_me); + } + } + if (type == SEM || type == ALL) { + arg.array = (ushort *) (void *)&seminfo; + maxid = semctl(0, 0, SEM_INFO, arg); + if (maxid < 0) + errx(EXIT_FAILURE, + _("kernel not configured for semaphores")); + for (id = 0; id <= maxid; id++) { + arg.buf = (struct semid_ds *)&semary; + rm_me = semctl(id, 0, SEM_STAT, arg); + if (rm_me < 0) + continue; + ret |= remove_id(SEM, 0, rm_me); + } + } +/* kFreeBSD hackery -- ah 20140723 */ +#ifndef MSG_STAT +#define MSG_STAT 11 +#endif +#ifndef MSG_INFO +#define MSG_INFO 12 +#endif + if (type == MSG || type == ALL) { + maxid = + msgctl(0, MSG_INFO, (struct msqid_ds *)(void *)&msginfo); + if (maxid < 0) + errx(EXIT_FAILURE, + _("kernel not configured for message queues")); + for (id = 0; id <= maxid; id++) { + rm_me = msgctl(id, MSG_STAT, &msgque); + if (rm_me < 0) + continue; + ret |= remove_id(MSG, 0, rm_me); + } + } + return ret; +} + +int main(int argc, char **argv) +{ + int c; + int ret = 0; + int id = -1; + int iskey; + int rm_all = 0; + type_id what_all = ALL; + + static const struct option longopts[] = { + {"shmem-id", required_argument, NULL, 'm'}, + {"shmem-key", required_argument, NULL, 'M'}, + {"queue-id", required_argument, NULL, 'q'}, + {"queue-key", required_argument, NULL, 'Q'}, + {"semaphore-id", required_argument, NULL, 's'}, + {"semaphore-key", required_argument, NULL, 'S'}, + {"all", optional_argument, NULL, 'a'}, + {"verbose", no_argument, NULL, 'v'}, + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + /* if the command is executed without parameters, do nothing */ + if (argc == 1) + return 0; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + /* check to see if the command is being invoked in the old way if so + * then remove argument list */ + if (deprecated_main(argc, argv)) + return EXIT_SUCCESS; + + /* process new syntax to conform with SYSV ipcrm */ + while((c = getopt_long(argc, argv, "q:m:s:Q:M:S:a::vhV", longopts, NULL)) != -1) { + iskey = 0; + switch (c) { + case 'M': + iskey = 1; + id = key_to_id(SHM, optarg); + if (id < 0) { + ret++; + break; + } + /* fallthrough */ + case 'm': + if (!iskey) + id = strtos32_or_err(optarg, _("failed to parse argument")); + if (remove_id(SHM, iskey, id)) + ret++; + break; + case 'Q': + iskey = 1; + id = key_to_id(MSG, optarg); + if (id < 0) { + ret++; + break; + } + /* fallthrough */ + case 'q': + if (!iskey) + id = strtos32_or_err(optarg, _("failed to parse argument")); + if (remove_id(MSG, iskey, id)) + ret++; + break; + case 'S': + iskey = 1; + id = key_to_id(SEM, optarg); + if (id < 0) { + ret++; + break; + } + /* fallthrough */ + case 's': + if (!iskey) + id = strtos32_or_err(optarg, _("failed to parse argument")); + if (remove_id(SEM, iskey, id)) + ret++; + break; + case 'a': + rm_all = 1; + if (optarg) { + if (!strcmp(optarg, "shm")) + what_all = SHM; + else if (!strcmp(optarg, "msg")) + what_all = MSG; + else if (!strcmp(optarg, "sem")) + what_all = SEM; + else + errx(EXIT_FAILURE, + _("unknown argument: %s"), optarg); + } else { + what_all = ALL; + } + break; + case 'v': + verbose = 1; + break; + case 'h': + usage(); + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (rm_all && remove_all(what_all)) + ret++; + + /* print usage if we still have some arguments left over */ + if (optind < argc) { + warnx(_("unknown argument: %s"), argv[optind]); + errtryhelp(EXIT_FAILURE); + } + + return ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/sys-utils/ipcs.1 b/sys-utils/ipcs.1 new file mode 100644 index 0000000..93c35e3 --- /dev/null +++ b/sys-utils/ipcs.1 @@ -0,0 +1,116 @@ +.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu) +.\" May be distributed under the GNU General Public License +.TH IPCS "1" "July 2014" "util-linux" "User Commands" +.SH NAME +ipcs \- show information on IPC facilities +.SH SYNOPSIS +.B ipcs +[options] +.SH DESCRIPTION +.B ipcs +shows information on the inter-process communication facilities +for which the calling process has read access. +By default it shows information about all three resources: +shared memory segments, message queues, and semaphore arrays. +.SH OPTIONS +.TP +\fB\-i\fR, \fB\-\-id\fR \fIid\fR +Show full details on just the one resource element identified by +.IR id . +This option needs to be combined with one of the three resource options: +.BR \-m , +.BR \-q " or" +.BR \-s . +.TP +\fB\-h\fR, \fB\-\-help\fR +Display help text and exit. +.TP +\fB\-V\fR, \fB\-\-version\fR +Display version information and exit. +.SS "Resource options" +.TP +\fB\-m\fR, \fB\-\-shmems\fR +Write information about active shared memory segments. +.TP +\fB\-q\fR, \fB\-\-queues\fR +Write information about active message queues. +.TP +\fB\-s\fR, \fB\-\-semaphores\fR +Write information about active semaphore sets. +.TP +\fB\-a\fR, \fB\-\-all\fR +Write information about all three resources (default). +.SS "Output formats" +Of these options only one takes effect: the last one specified. +.TP +\fB\-c\fR, \fB\-\-creator\fR +Show creator and owner. +.TP +\fB\-l\fR, \fB\-\-limits\fR +Show resource limits. +.TP +\fB\-p\fR, \fB\-\-pid\fR +Show PIDs of creator and last operator. +.TP +\fB\-t\fR, \fB\-\-time\fR +Write time information. The time of the last control operation that changed +the access permissions for all facilities, the time of the last +.BR msgsnd (2) +and +.BR msgrcv (2) +operations on message queues, the time of the last +.BR shmat (2) +and +.BR shmdt (2) +operations on shared memory, and the time of the last +.BR semop (2) +operation on semaphores. +.TP +\fB\-u\fR, \fB\-\-summary\fR +Show status summary. +.SS "Representation" +These affect only the \fB\-l\fR (\fB\-\-limits\fR) option. +.TP +\fB\-b\fR, \fB\-\-bytes\fR +Print sizes in bytes. +.TP +.B \-\-human +Print sizes in human-readable format. +.SH SEE ALSO +.BR ipcmk (1), +.BR ipcrm (1), +.BR msgrcv (2), +.BR msgsnd (2), +.BR semget (2), +.BR semop (2), +.BR shmat (2), +.BR shmdt (2), +.BR shmget (2) +.SH CONFORMING TO +The Linux ipcs utility is not fully compatible to the POSIX ipcs utility. +The Linux version does not support the POSIX +.BR \-a , +.B \-b +and +.B \-o +options, but does support the +.B \-l +and +.B \-u +options not defined by POSIX. A portable application shall not use the +.BR \-a , +.BR \-b , +.BR \-o , +.BR \-l , +and +.B \-u +options. +.SH AUTHOR +.UR balasub@cis.ohio-state.edu +Krishna Balasubramanian +.UE +.SH AVAILABILITY +The ipcs command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/ipcs.c b/sys-utils/ipcs.c new file mode 100644 index 0000000..73cf28a --- /dev/null +++ b/sys-utils/ipcs.c @@ -0,0 +1,668 @@ +/* Original author unknown, may be "krishna balasub@cis.ohio-state.edu" */ +/* + * Modified Sat Oct 9 10:55:28 1993 for 0.99.13 + * + * Patches from Mike Jagdis (jaggy@purplet.demon.co.uk) applied Wed Feb 8 + * 12:12:21 1995 by faith@cs.unc.edu to print numeric uids if no passwd file + * entry. + * + * Patch from arnolds@ifns.de (Heinz-Ado Arnolds) applied Mon Jul 1 19:30:41 + * 1996 by janl@math.uio.no to add code missing in case PID: clauses. + * + * Patched to display the key field -- hy@picksys.com 12/18/96 + * + * 1999-02-22 Arkadiusz MiÅ›kiewicz <misiek@pld.ORG.PL> + * - added Native Language Support + */ + +#include <errno.h> +#include <getopt.h> + +#include "c.h" +#include "nls.h" +#include "closestream.h" + +#include "ipcutils.h" + +enum output_formats { + NOTSPECIFIED, + LIMITS, + STATUS, + CREATOR, + TIME, + PID +}; +enum { + OPT_HUMAN = CHAR_MAX + 1 +}; + +static void do_shm (char format, int unit); +static void print_shm (int id, int unit); +static void do_sem (char format); +static void print_sem (int id); +static void do_msg (char format, int unit); +static void print_msg (int id, int unit); + +/* we read time as int64_t from /proc, so cast... */ +#define xctime(_x) ctime((time_t *) (_x)) + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %1$s [resource-option...] [output-option]\n" + " %1$s -m|-q|-s -i <id>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Show information on IPC facilities.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -i, --id <id> print details on resource identified by <id>\n"), out); + printf(USAGE_HELP_OPTIONS(16)); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Resource options:\n"), out); + fputs(_(" -m, --shmems shared memory segments\n"), out); + fputs(_(" -q, --queues message queues\n"), out); + fputs(_(" -s, --semaphores semaphores\n"), out); + fputs(_(" -a, --all all (default)\n"), out); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Output options:\n"), out); + fputs(_(" -t, --time show attach, detach and change times\n"), out); + fputs(_(" -p, --pid show PIDs of creator and last operator\n"), out); + fputs(_(" -c, --creator show creator and owner\n"), out); + fputs(_(" -l, --limits show resource limits\n"), out); + fputs(_(" -u, --summary show status summary\n"), out); + fputs(_(" --human show sizes in human-readable format\n"), out); + fputs(_(" -b, --bytes show sizes in bytes\n"), out); + printf(USAGE_MAN_TAIL("ipcs(1)")); + + exit(EXIT_SUCCESS); +} + +int main (int argc, char **argv) +{ + int opt, msg = 0, shm = 0, sem = 0, id = 0, specific = 0; + char format = NOTSPECIFIED; + int unit = IPC_UNIT_DEFAULT; + static const struct option longopts[] = { + {"id", required_argument, NULL, 'i'}, + {"queues", no_argument, NULL, 'q'}, + {"shmems", no_argument, NULL, 'm'}, + {"semaphores", no_argument, NULL, 's'}, + {"all", no_argument, NULL, 'a'}, + {"time", no_argument, NULL, 't'}, + {"pid", no_argument, NULL, 'p'}, + {"creator", no_argument, NULL, 'c'}, + {"limits", no_argument, NULL, 'l'}, + {"summary", no_argument, NULL, 'u'}, + {"human", no_argument, NULL, OPT_HUMAN}, + {"bytes", no_argument, NULL, 'b'}, + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + char options[] = "i:qmsatpclubVh"; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((opt = getopt_long(argc, argv, options, longopts, NULL)) != -1) { + switch (opt) { + case 'i': + id = atoi (optarg); + specific = 1; + break; + case 'a': + msg = shm = sem = 1; + break; + case 'q': + msg = 1; + break; + case 'm': + shm = 1; + break; + case 's': + sem = 1; + break; + case 't': + format = TIME; + break; + case 'c': + format = CREATOR; + break; + case 'p': + format = PID; + break; + case 'l': + format = LIMITS; + break; + case 'u': + format = STATUS; + break; + case OPT_HUMAN: + unit = IPC_UNIT_HUMAN; + break; + case 'b': + unit = IPC_UNIT_BYTES; + break; + case 'h': + usage(); + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (specific && (msg + shm + sem != 1)) + errx (EXIT_FAILURE, + _("when using an ID, a single resource must be specified")); + if (specific) { + if (msg) + print_msg (id, unit); + if (shm) + print_shm (id, unit); + if (sem) + print_sem (id); + } else { + if (!msg && !shm && !sem) + msg = shm = sem = 1; + printf ("\n"); + if (msg) { + do_msg (format, unit); + printf ("\n"); + } + if (shm) { + do_shm (format, unit); + printf ("\n"); + } + if (sem) { + do_sem (format); + printf ("\n"); + } + } + return EXIT_SUCCESS; +} + +static void do_shm (char format, int unit) +{ + struct passwd *pw; + struct shm_data *shmds, *shmdsp; + + switch (format) { + case LIMITS: + { + struct ipc_limits lim; + uint64_t tmp, pgsz = getpagesize(); + + if (ipc_shm_get_limits(&lim)) { + printf (_("unable to fetch shared memory limits\n")); + return; + } + printf (_("------ Shared Memory Limits --------\n")); + printf (_("max number of segments = %ju\n"), lim.shmmni); + ipc_print_size(unit == IPC_UNIT_DEFAULT ? IPC_UNIT_KB : unit, + _("max seg size"), lim.shmmax, "\n", 0); + + tmp = (uint64_t) lim.shmall * pgsz; + /* overflow handling, at least we don't print ridiculous small values */ + if (lim.shmall != 0 && tmp / lim.shmall != pgsz) { + tmp = UINT64_MAX - (UINT64_MAX % pgsz); + } + ipc_print_size(unit == IPC_UNIT_DEFAULT ? IPC_UNIT_KB : unit, + _("max total shared memory"), tmp, "\n", 0); + ipc_print_size(unit == IPC_UNIT_DEFAULT ? IPC_UNIT_BYTES : unit, + _("min seg size"), lim.shmmin, "\n", 0); + return; + } + case STATUS: + { + int maxid; + struct shmid_ds shmbuf; + struct shm_info *shm_info; + + maxid = shmctl (0, SHM_INFO, &shmbuf); + shm_info = (struct shm_info *) &shmbuf; + if (maxid < 0) { + printf (_("kernel not configured for shared memory\n")); + return; + } + + printf (_("------ Shared Memory Status --------\n")); + /* + * TRANSLATORS: This output format is maintained for backward + * compatibility as ipcs is used in scripts. For consistency + * with the rest, the translated form can follow this model: + * + * "segments allocated = %d\n" + * "pages allocated = %ld\n" + * "pages resident = %ld\n" + * "pages swapped = %ld\n" + * "swap performance = %ld attempts, %ld successes\n" + */ + printf (_("segments allocated %d\n" + "pages allocated %ld\n" + "pages resident %ld\n" + "pages swapped %ld\n" + "Swap performance: %ld attempts\t %ld successes\n"), + shm_info->used_ids, + shm_info->shm_tot, + shm_info->shm_rss, + shm_info->shm_swp, + shm_info->swap_attempts, shm_info->swap_successes); + return; + } + + /* + * Headers only + */ + case CREATOR: + printf (_("------ Shared Memory Segment Creators/Owners --------\n")); + printf ("%-10s %-10s %-10s %-10s %-10s %-10s\n", + _("shmid"),_("perms"),_("cuid"),_("cgid"),_("uid"),_("gid")); + break; + + case TIME: + printf (_("------ Shared Memory Attach/Detach/Change Times --------\n")); + printf ("%-10s %-10s %-20s %-20s %-20s\n", + _("shmid"),_("owner"),_("attached"),_("detached"), + _("changed")); + break; + + case PID: + printf (_("------ Shared Memory Creator/Last-op PIDs --------\n")); + printf ("%-10s %-10s %-10s %-10s\n", + _("shmid"),_("owner"),_("cpid"),_("lpid")); + break; + + default: + printf (_("------ Shared Memory Segments --------\n")); + printf ("%-10s %-10s %-10s %-10s %-10s %-10s %-12s\n", + _("key"),_("shmid"),_("owner"),_("perms"), + unit == IPC_UNIT_HUMAN ? _("size") : _("bytes"), + _("nattch"),_("status")); + break; + } + + /* + * Print data + */ + if (ipc_shm_get_info(-1, &shmds) < 1) + return; + + for (shmdsp = shmds; shmdsp->next != NULL; shmdsp = shmdsp->next) { + if (format == CREATOR) { + ipc_print_perms(stdout, &shmdsp->shm_perm); + continue; + } + pw = getpwuid(shmdsp->shm_perm.uid); + switch (format) { + case TIME: + if (pw) + printf ("%-10d %-10.10s", shmdsp->shm_perm.id, pw->pw_name); + else + printf ("%-10d %-10u", shmdsp->shm_perm.id, shmdsp->shm_perm.uid); + /* ctime uses static buffer: use separate calls */ + printf(" %-20.16s", shmdsp->shm_atim + ? xctime(&shmdsp->shm_atim) + 4 : _("Not set")); + printf(" %-20.16s", shmdsp->shm_dtim + ? xctime(&shmdsp->shm_dtim) + 4 : _("Not set")); + printf(" %-20.16s\n", shmdsp->shm_ctim + ? xctime(&shmdsp->shm_ctim) + 4 : _("Not set")); + break; + case PID: + if (pw) + printf ("%-10d %-10.10s", shmdsp->shm_perm.id, pw->pw_name); + else + printf ("%-10d %-10u", shmdsp->shm_perm.id, shmdsp->shm_perm.uid); + printf (" %-10u %-10u\n", + shmdsp->shm_cprid, shmdsp->shm_lprid); + break; + + default: + printf("0x%08x ", shmdsp->shm_perm.key); + if (pw) + printf ("%-10d %-10.10s", shmdsp->shm_perm.id, pw->pw_name); + else + printf ("%-10d %-10u", shmdsp->shm_perm.id, shmdsp->shm_perm.uid); + printf (" %-10o ", shmdsp->shm_perm.mode & 0777); + + if (unit == IPC_UNIT_HUMAN) + ipc_print_size(unit, NULL, shmdsp->shm_segsz, " ", 6); + else + ipc_print_size(unit, NULL, shmdsp->shm_segsz, NULL, -10); + + printf (" %-10ju %-6s %-6s\n", + shmdsp->shm_nattch, + shmdsp->shm_perm.mode & SHM_DEST ? _("dest") : " ", + shmdsp->shm_perm.mode & SHM_LOCKED ? _("locked") : " "); + break; + } + } + + ipc_shm_free_info(shmds); + return; +} + +static void do_sem (char format) +{ + struct passwd *pw; + struct sem_data *semds, *semdsp; + + switch (format) { + case LIMITS: + { + struct ipc_limits lim; + + if (ipc_sem_get_limits(&lim)) { + printf (_("unable to fetch semaphore limits\n")); + return; + } + printf (_("------ Semaphore Limits --------\n")); + printf (_("max number of arrays = %d\n"), lim.semmni); + printf (_("max semaphores per array = %d\n"), lim.semmsl); + printf (_("max semaphores system wide = %d\n"), lim.semmns); + printf (_("max ops per semop call = %d\n"), lim.semopm); + printf (_("semaphore max value = %u\n"), lim.semvmx); + return; + } + case STATUS: + { + struct seminfo seminfo; + union semun arg; + arg.array = (ushort *) (void *) &seminfo; + if (semctl (0, 0, SEM_INFO, arg) < 0) { + printf (_("kernel not configured for semaphores\n")); + return; + } + printf (_("------ Semaphore Status --------\n")); + printf (_("used arrays = %d\n"), seminfo.semusz); + printf (_("allocated semaphores = %d\n"), seminfo.semaem); + return; + } + + case CREATOR: + printf (_("------ Semaphore Arrays Creators/Owners --------\n")); + printf ("%-10s %-10s %-10s %-10s %-10s %-10s\n", + _("semid"),_("perms"),_("cuid"),_("cgid"),_("uid"),_("gid")); + break; + + case TIME: + printf (_("------ Semaphore Operation/Change Times --------\n")); + printf ("%-8s %-10s %-26.24s %-26.24s\n", + _("semid"),_("owner"),_("last-op"),_("last-changed")); + break; + + case PID: + break; + + default: + printf (_("------ Semaphore Arrays --------\n")); + printf ("%-10s %-10s %-10s %-10s %-10s\n", + _("key"),_("semid"),_("owner"),_("perms"),_("nsems")); + break; + } + + /* + * Print data + */ + if (ipc_sem_get_info(-1, &semds) < 1) + return; + + for (semdsp = semds; semdsp->next != NULL; semdsp = semdsp->next) { + if (format == CREATOR) { + ipc_print_perms(stdout, &semdsp->sem_perm); + continue; + } + pw = getpwuid(semdsp->sem_perm.uid); + switch (format) { + case TIME: + if (pw) + printf ("%-8d %-10.10s", semdsp->sem_perm.id, pw->pw_name); + else + printf ("%-8d %-10u", semdsp->sem_perm.id, semdsp->sem_perm.uid); + printf (" %-26.24s", semdsp->sem_otime + ? xctime(&semdsp->sem_otime) : _("Not set")); + printf (" %-26.24s\n", semdsp->sem_ctime + ? xctime( &semdsp->sem_ctime) : _("Not set")); + break; + case PID: + break; + + default: + printf("0x%08x ", semdsp->sem_perm.key); + if (pw) + printf ("%-10d %-10.10s", semdsp->sem_perm.id, pw->pw_name); + else + printf ("%-10d %-10u", semdsp->sem_perm.id, semdsp->sem_perm.uid); + printf (" %-10o %-10ju\n", + semdsp->sem_perm.mode & 0777, + semdsp->sem_nsems); + break; + } + } + + ipc_sem_free_info(semds); + return; +} + +static void do_msg (char format, int unit) +{ + struct passwd *pw; + struct msg_data *msgds, *msgdsp; + + switch (format) { + case LIMITS: + { + struct ipc_limits lim; + + if (ipc_msg_get_limits(&lim)) { + printf (_("unable to fetch message limits\n")); + return; + } + printf (_("------ Messages Limits --------\n")); + printf (_("max queues system wide = %d\n"), lim.msgmni); + ipc_print_size(unit == IPC_UNIT_DEFAULT ? IPC_UNIT_BYTES : unit, + _("max size of message"), lim.msgmax, "\n", 0); + ipc_print_size(unit == IPC_UNIT_DEFAULT ? IPC_UNIT_BYTES : unit, + _("default max size of queue"), lim.msgmnb, "\n", 0); + return; + } + case STATUS: + { + struct msginfo msginfo; + if (msgctl (0, MSG_INFO, (struct msqid_ds *) (void *) &msginfo) < 0) { + printf (_("kernel not configured for message queues\n")); + return; + } + printf (_("------ Messages Status --------\n")); +#ifndef __FreeBSD_kernel__ + printf (_("allocated queues = %d\n"), msginfo.msgpool); + printf (_("used headers = %d\n"), msginfo.msgmap); +#endif + ipc_print_size(unit, _("used space"), msginfo.msgtql, + unit == IPC_UNIT_DEFAULT ? _(" bytes\n") : "\n", 0); + return; + } + case CREATOR: + printf (_("------ Message Queues Creators/Owners --------\n")); + printf ("%-10s %-10s %-10s %-10s %-10s %-10s\n", + _("msqid"),_("perms"),_("cuid"),_("cgid"),_("uid"),_("gid")); + break; + + case TIME: + printf (_("------ Message Queues Send/Recv/Change Times --------\n")); + printf ("%-8s %-10s %-20s %-20s %-20s\n", + _("msqid"),_("owner"),_("send"),_("recv"),_("change")); + break; + + case PID: + printf (_("------ Message Queues PIDs --------\n")); + printf ("%-10s %-10s %-10s %-10s\n", + _("msqid"),_("owner"),_("lspid"),_("lrpid")); + break; + + default: + printf (_("------ Message Queues --------\n")); + printf ("%-10s %-10s %-10s %-10s %-12s %-12s\n", + _("key"), _("msqid"), _("owner"), _("perms"), + unit == IPC_UNIT_HUMAN ? _("size") : _("used-bytes"), + _("messages")); + break; + } + + /* + * Print data + */ + if (ipc_msg_get_info(-1, &msgds) < 1) + return; + + for (msgdsp = msgds; msgdsp->next != NULL; msgdsp = msgdsp->next) { + if (format == CREATOR) { + ipc_print_perms(stdout, &msgdsp->msg_perm); + continue; + } + pw = getpwuid(msgdsp->msg_perm.uid); + switch (format) { + case TIME: + if (pw) + printf ("%-8d %-10.10s", msgdsp->msg_perm.id, pw->pw_name); + else + printf ("%-8d %-10u", msgdsp->msg_perm.id, msgdsp->msg_perm.uid); + printf (" %-20.16s", msgdsp->q_stime + ? xctime(&msgdsp->q_stime) + 4 : _("Not set")); + printf (" %-20.16s", msgdsp->q_rtime + ? xctime(&msgdsp->q_rtime) + 4 : _("Not set")); + printf (" %-20.16s\n", msgdsp->q_ctime + ? xctime(&msgdsp->q_ctime) + 4 : _("Not set")); + break; + case PID: + if (pw) + printf ("%-8d %-10.10s", msgdsp->msg_perm.id, pw->pw_name); + else + printf ("%-8d %-10u", msgdsp->msg_perm.id, msgdsp->msg_perm.uid); + printf (" %5d %5d\n", + msgdsp->q_lspid, msgdsp->q_lrpid); + break; + + default: + printf( "0x%08x ",msgdsp->msg_perm.key ); + if (pw) + printf ("%-10d %-10.10s", msgdsp->msg_perm.id, pw->pw_name); + else + printf ("%-10d %-10u", msgdsp->msg_perm.id, msgdsp->msg_perm.uid); + printf (" %-10o ", msgdsp->msg_perm.mode & 0777); + + if (unit == IPC_UNIT_HUMAN) + ipc_print_size(unit, NULL, msgdsp->q_cbytes, " ", 6); + else + ipc_print_size(unit, NULL, msgdsp->q_cbytes, NULL, -12); + + printf (" %-12ju\n", msgdsp->q_qnum); + break; + } + } + + ipc_msg_free_info(msgds); + return; +} + +static void print_shm(int shmid, int unit) +{ + struct shm_data *shmdata; + + if (ipc_shm_get_info(shmid, &shmdata) < 1) { + warnx(_("id %d not found"), shmid); + return; + } + + printf(_("\nShared memory Segment shmid=%d\n"), shmid); + printf(_("uid=%u\tgid=%u\tcuid=%u\tcgid=%u\n"), + shmdata->shm_perm.uid, shmdata->shm_perm.gid, + shmdata->shm_perm.cuid, shmdata->shm_perm.cgid); + printf(_("mode=%#o\taccess_perms=%#o\n"), shmdata->shm_perm.mode, + shmdata->shm_perm.mode & 0777); + ipc_print_size(unit, unit == IPC_UNIT_HUMAN ? _("size=") : _("bytes="), + shmdata->shm_segsz, "\t", 0); + printf(_("lpid=%u\tcpid=%u\tnattch=%jd\n"), + shmdata->shm_lprid, shmdata->shm_cprid, + shmdata->shm_nattch); + printf(_("att_time=%-26.24s\n"), + shmdata->shm_atim ? xctime(&(shmdata->shm_atim)) : _("Not set")); + printf(_("det_time=%-26.24s\n"), + shmdata->shm_dtim ? xctime(&shmdata->shm_dtim) : _("Not set")); + printf(_("change_time=%-26.24s\n"), xctime(&shmdata->shm_ctim)); + printf("\n"); + + ipc_shm_free_info(shmdata); +} + +static void print_msg(int msgid, int unit) +{ + struct msg_data *msgdata; + + if (ipc_msg_get_info(msgid, &msgdata) < 1) { + warnx(_("id %d not found"), msgid); + return; + } + + printf(_("\nMessage Queue msqid=%d\n"), msgid); + printf(_("uid=%u\tgid=%u\tcuid=%u\tcgid=%u\tmode=%#o\n"), + msgdata->msg_perm.uid, msgdata->msg_perm.gid, + msgdata->msg_perm.cuid, msgdata->msg_perm.cgid, + msgdata->msg_perm.mode); + ipc_print_size(unit, unit == IPC_UNIT_HUMAN ? _("csize=") : _("cbytes="), + msgdata->q_cbytes, "\t", 0); + ipc_print_size(unit, unit == IPC_UNIT_HUMAN ? _("qsize=") : _("qbytes="), + msgdata->q_qbytes, "\t", 0); + printf("qnum=%jd\tlspid=%d\tlrpid=%d\n", + msgdata->q_qnum, + msgdata->q_lspid, msgdata->q_lrpid); + printf(_("send_time=%-26.24s\n"), + msgdata->q_stime ? xctime(&msgdata->q_stime) : _("Not set")); + printf(_("rcv_time=%-26.24s\n"), + msgdata->q_rtime ? xctime(&msgdata->q_rtime) : _("Not set")); + printf(_("change_time=%-26.24s\n"), + msgdata->q_ctime ? xctime(&msgdata->q_ctime) : _("Not set")); + printf("\n"); + + ipc_msg_free_info(msgdata); +} + +static void print_sem(int semid) +{ + struct sem_data *semdata; + size_t i; + + if (ipc_sem_get_info(semid, &semdata) < 1) { + warnx(_("id %d not found"), semid); + return; + } + + printf(_("\nSemaphore Array semid=%d\n"), semid); + printf(_("uid=%u\t gid=%u\t cuid=%u\t cgid=%u\n"), + semdata->sem_perm.uid, semdata->sem_perm.gid, + semdata->sem_perm.cuid, semdata->sem_perm.cgid); + printf(_("mode=%#o, access_perms=%#o\n"), + semdata->sem_perm.mode, semdata->sem_perm.mode & 0777); + printf(_("nsems = %ju\n"), semdata->sem_nsems); + printf(_("otime = %-26.24s\n"), + semdata->sem_otime ? xctime(&semdata->sem_otime) : _("Not set")); + printf(_("ctime = %-26.24s\n"), xctime(&semdata->sem_ctime)); + + printf("%-10s %-10s %-10s %-10s %-10s\n", + _("semnum"), _("value"), _("ncount"), _("zcount"), _("pid")); + + for (i = 0; i < semdata->sem_nsems; i++) { + struct sem_elem *e = &semdata->elements[i]; + printf("%-10zu %-10d %-10d %-10d %-10d\n", + i, e->semval, e->ncount, e->zcount, e->pid); + } + printf("\n"); + ipc_sem_free_info(semdata); +} diff --git a/sys-utils/ipcutils.c b/sys-utils/ipcutils.c new file mode 100644 index 0000000..5fe297f --- /dev/null +++ b/sys-utils/ipcutils.c @@ -0,0 +1,533 @@ +#include <inttypes.h> + +#include "c.h" +#include "nls.h" +#include "xalloc.h" +#include "path.h" +#include "pathnames.h" +#include "ipcutils.h" +#include "strutils.h" + +#ifndef SEMVMX +# define SEMVMX 32767 /* <= 32767 semaphore maximum value */ +#endif +#ifndef SHMMIN +# define SHMMIN 1 /* min shared segment size in bytes */ +#endif + + +int ipc_msg_get_limits(struct ipc_limits *lim) +{ + if (access(_PATH_PROC_IPC_MSGMNI, F_OK) == 0 && + access(_PATH_PROC_IPC_MSGMNB, F_OK) == 0 && + access(_PATH_PROC_IPC_MSGMAX, F_OK) == 0) { + + ul_path_read_s32(NULL, &lim->msgmni, _PATH_PROC_IPC_MSGMNI); + ul_path_read_s32(NULL, &lim->msgmnb, _PATH_PROC_IPC_MSGMNB); + ul_path_read_u64(NULL, &lim->msgmax, _PATH_PROC_IPC_MSGMAX); + } else { + struct msginfo msginfo; + + if (msgctl(0, IPC_INFO, (struct msqid_ds *) &msginfo) < 0) + return 1; + lim->msgmni = msginfo.msgmni; + lim->msgmnb = msginfo.msgmnb; + lim->msgmax = msginfo.msgmax; + } + + return 0; +} + +int ipc_sem_get_limits(struct ipc_limits *lim) +{ + FILE *f; + int rc = 0; + + lim->semvmx = SEMVMX; + + f = fopen(_PATH_PROC_IPC_SEM, "r"); + if (f) { + rc = fscanf(f, "%d\t%d\t%d\t%d", + &lim->semmsl, &lim->semmns, &lim->semopm, &lim->semmni); + fclose(f); + } + + if (rc != 4) { + struct seminfo seminfo = { .semmni = 0 }; + union semun arg = { .array = (ushort *) &seminfo }; + + if (semctl(0, 0, IPC_INFO, arg) < 0) + return 1; + lim->semmni = seminfo.semmni; + lim->semmsl = seminfo.semmsl; + lim->semmns = seminfo.semmns; + lim->semopm = seminfo.semopm; + } + + return 0; +} + +int ipc_shm_get_limits(struct ipc_limits *lim) +{ + lim->shmmin = SHMMIN; + + if (access(_PATH_PROC_IPC_SHMALL, F_OK) == 0 && + access(_PATH_PROC_IPC_SHMMAX, F_OK) == 0 && + access(_PATH_PROC_IPC_SHMMNI, F_OK) == 0) { + + ul_path_read_u64(NULL, &lim->shmall, _PATH_PROC_IPC_SHMALL); + ul_path_read_u64(NULL, &lim->shmmax, _PATH_PROC_IPC_SHMMAX); + ul_path_read_u64(NULL, &lim->shmmni, _PATH_PROC_IPC_SHMMNI); + + } else { + struct shminfo *shminfo; + struct shmid_ds shmbuf; + + if (shmctl(0, IPC_INFO, &shmbuf) < 0) + return 1; + shminfo = (struct shminfo *) &shmbuf; + lim->shmmni = shminfo->shmmni; + lim->shmall = shminfo->shmall; + lim->shmmax = shminfo->shmmax; + } + + return 0; +} + +int ipc_shm_get_info(int id, struct shm_data **shmds) +{ + FILE *f; + int i = 0, maxid; + char buf[BUFSIZ]; + struct shm_data *p; + struct shmid_ds dummy; + + p = *shmds = xcalloc(1, sizeof(struct shm_data)); + p->next = NULL; + + f = fopen(_PATH_PROC_SYSV_SHM, "r"); + if (!f) + goto shm_fallback; + + while (fgetc(f) != '\n'); /* skip header */ + + while (fgets(buf, sizeof(buf), f) != NULL) { + /* scan for the first 14-16 columns (e.g. Linux 2.6.32 has 14) */ + p->shm_rss = 0xdead; + p->shm_swp = 0xdead; + if (sscanf(buf, + "%d %d %o %"SCNu64 " %u %u " + "%"SCNu64 " %u %u %u %u %"SCNi64 " %"SCNi64 " %"SCNi64 + " %"SCNu64 " %"SCNu64 "\n", + &p->shm_perm.key, + &p->shm_perm.id, + &p->shm_perm.mode, + &p->shm_segsz, + &p->shm_cprid, + &p->shm_lprid, + &p->shm_nattch, + &p->shm_perm.uid, + &p->shm_perm.gid, + &p->shm_perm.cuid, + &p->shm_perm.cgid, + &p->shm_atim, + &p->shm_dtim, + &p->shm_ctim, + &p->shm_rss, + &p->shm_swp) < 14) + continue; /* invalid line, skipped */ + + if (id > -1) { + /* ID specified */ + if (id == p->shm_perm.id) { + i = 1; + break; + } else + continue; + } + + p->next = xcalloc(1, sizeof(struct shm_data)); + p = p->next; + p->next = NULL; + i++; + } + + if (i == 0) + free(*shmds); + fclose(f); + return i; + + /* Fallback; /proc or /sys file(s) missing. */ +shm_fallback: + maxid = shmctl(0, SHM_INFO, &dummy); + + for (int j = 0; j <= maxid; j++) { + int shmid; + struct shmid_ds shmseg; + struct ipc_perm *ipcp = &shmseg.shm_perm; + + shmid = shmctl(j, SHM_STAT, &shmseg); + if (shmid < 0 || (id > -1 && shmid != id)) { + continue; + } + + i++; + p->shm_perm.key = ipcp->KEY; + p->shm_perm.id = shmid; + p->shm_perm.mode = ipcp->mode; + p->shm_segsz = shmseg.shm_segsz; + p->shm_cprid = shmseg.shm_cpid; + p->shm_lprid = shmseg.shm_lpid; + p->shm_nattch = shmseg.shm_nattch; + p->shm_perm.uid = ipcp->uid; + p->shm_perm.gid = ipcp->gid; + p->shm_perm.cuid = ipcp->cuid; + p->shm_perm.cgid = ipcp->cuid; + p->shm_atim = shmseg.shm_atime; + p->shm_dtim = shmseg.shm_dtime; + p->shm_ctim = shmseg.shm_ctime; + p->shm_rss = 0xdead; + p->shm_swp = 0xdead; + + if (id < 0) { + p->next = xcalloc(1, sizeof(struct shm_data)); + p = p->next; + p->next = NULL; + } else + break; + } + + if (i == 0) + free(*shmds); + return i; +} + +void ipc_shm_free_info(struct shm_data *shmds) +{ + while (shmds) { + struct shm_data *next = shmds->next; + free(shmds); + shmds = next; + } +} + +static void get_sem_elements(struct sem_data *p) +{ + size_t i; + + if (!p || !p->sem_nsems || p->sem_perm.id < 0) + return; + + p->elements = xcalloc(p->sem_nsems, sizeof(struct sem_elem)); + + for (i = 0; i < p->sem_nsems; i++) { + struct sem_elem *e = &p->elements[i]; + union semun arg = { .val = 0 }; + + e->semval = semctl(p->sem_perm.id, i, GETVAL, arg); + if (e->semval < 0) + err(EXIT_FAILURE, _("%s failed"), "semctl(GETVAL)"); + + e->ncount = semctl(p->sem_perm.id, i, GETNCNT, arg); + if (e->ncount < 0) + err(EXIT_FAILURE, _("%s failed"), "semctl(GETNCNT)"); + + e->zcount = semctl(p->sem_perm.id, i, GETZCNT, arg); + if (e->zcount < 0) + err(EXIT_FAILURE, _("%s failed"), "semctl(GETZCNT)"); + + e->pid = semctl(p->sem_perm.id, i, GETPID, arg); + if (e->pid < 0) + err(EXIT_FAILURE, _("%s failed"), "semctl(GETPID)"); + } +} + +int ipc_sem_get_info(int id, struct sem_data **semds) +{ + FILE *f; + int i = 0, maxid; + struct sem_data *p; + struct seminfo dummy; + union semun arg; + + p = *semds = xcalloc(1, sizeof(struct sem_data)); + p->next = NULL; + + f = fopen(_PATH_PROC_SYSV_SEM, "r"); + if (!f) + goto sem_fallback; + + while (fgetc(f) != '\n') ; /* skip header */ + + while (feof(f) == 0) { + if (fscanf(f, + "%d %d %o %" SCNu64 " %u %u %u %u %" + SCNi64 " %" SCNi64 "\n", + &p->sem_perm.key, + &p->sem_perm.id, + &p->sem_perm.mode, + &p->sem_nsems, + &p->sem_perm.uid, + &p->sem_perm.gid, + &p->sem_perm.cuid, + &p->sem_perm.cgid, + &p->sem_otime, + &p->sem_ctime) != 10) + continue; + + if (id > -1) { + /* ID specified */ + if (id == p->sem_perm.id) { + get_sem_elements(p); + i = 1; + break; + } else + continue; + } + + p->next = xcalloc(1, sizeof(struct sem_data)); + p = p->next; + p->next = NULL; + i++; + } + + if (i == 0) + free(*semds); + fclose(f); + return i; + + /* Fallback; /proc or /sys file(s) missing. */ +sem_fallback: + arg.array = (ushort *) (void *)&dummy; + maxid = semctl(0, 0, SEM_INFO, arg); + + for (int j = 0; j <= maxid; j++) { + int semid; + struct semid_ds semseg; + struct ipc_perm *ipcp = &semseg.sem_perm; + arg.buf = (struct semid_ds *)&semseg; + + semid = semctl(j, 0, SEM_STAT, arg); + if (semid < 0 || (id > -1 && semid != id)) { + continue; + } + + i++; + p->sem_perm.key = ipcp->KEY; + p->sem_perm.id = semid; + p->sem_perm.mode = ipcp->mode; + p->sem_nsems = semseg.sem_nsems; + p->sem_perm.uid = ipcp->uid; + p->sem_perm.gid = ipcp->gid; + p->sem_perm.cuid = ipcp->cuid; + p->sem_perm.cgid = ipcp->cuid; + p->sem_otime = semseg.sem_otime; + p->sem_ctime = semseg.sem_ctime; + + if (id < 0) { + p->next = xcalloc(1, sizeof(struct sem_data)); + p = p->next; + p->next = NULL; + i++; + } else { + get_sem_elements(p); + break; + } + } + + if (i == 0) + free(*semds); + return i; +} + +void ipc_sem_free_info(struct sem_data *semds) +{ + while (semds) { + struct sem_data *next = semds->next; + free(semds->elements); + free(semds); + semds = next; + } +} + +int ipc_msg_get_info(int id, struct msg_data **msgds) +{ + FILE *f; + int i = 0, maxid; + struct msg_data *p; + struct msqid_ds dummy; + struct msqid_ds msgseg; + + p = *msgds = xcalloc(1, sizeof(struct msg_data)); + p->next = NULL; + + f = fopen(_PATH_PROC_SYSV_MSG, "r"); + if (!f) + goto msg_fallback; + + while (fgetc(f) != '\n') ; /* skip header */ + + while (feof(f) == 0) { + if (fscanf(f, + "%d %d %o %" SCNu64 " %" SCNu64 + " %u %u %u %u %u %u %" SCNi64 " %" SCNi64 " %" SCNi64 "\n", + &p->msg_perm.key, + &p->msg_perm.id, + &p->msg_perm.mode, + &p->q_cbytes, + &p->q_qnum, + &p->q_lspid, + &p->q_lrpid, + &p->msg_perm.uid, + &p->msg_perm.gid, + &p->msg_perm.cuid, + &p->msg_perm.cgid, + &p->q_stime, + &p->q_rtime, + &p->q_ctime) != 14) + continue; + + if (id > -1) { + /* ID specified */ + if (id == p->msg_perm.id) { + if (msgctl(id, IPC_STAT, &msgseg) != -1) + p->q_qbytes = msgseg.msg_qbytes; + i = 1; + break; + } else + continue; + } + + p->next = xcalloc(1, sizeof(struct msg_data)); + p = p->next; + p->next = NULL; + i++; + } + + if (i == 0) + free(*msgds); + fclose(f); + return i; + + /* Fallback; /proc or /sys file(s) missing. */ +msg_fallback: + maxid = msgctl(0, MSG_INFO, &dummy); + + for (int j = 0; j <= maxid; j++) { + int msgid; + struct ipc_perm *ipcp = &msgseg.msg_perm; + + msgid = msgctl(j, MSG_STAT, &msgseg); + if (msgid < 0 || (id > -1 && msgid != id)) { + continue; + } + + i++; + p->msg_perm.key = ipcp->KEY; + p->msg_perm.id = msgid; + p->msg_perm.mode = ipcp->mode; + p->q_cbytes = msgseg.msg_cbytes; + p->q_qnum = msgseg.msg_qnum; + p->q_lspid = msgseg.msg_lspid; + p->q_lrpid = msgseg.msg_lrpid; + p->msg_perm.uid = ipcp->uid; + p->msg_perm.gid = ipcp->gid; + p->msg_perm.cuid = ipcp->cuid; + p->msg_perm.cgid = ipcp->cgid; + p->q_stime = msgseg.msg_stime; + p->q_rtime = msgseg.msg_rtime; + p->q_ctime = msgseg.msg_ctime; + p->q_qbytes = msgseg.msg_qbytes; + + if (id < 0) { + p->next = xcalloc(1, sizeof(struct msg_data)); + p = p->next; + p->next = NULL; + } else + break; + } + + if (i == 0) + free(*msgds); + return i; +} + +void ipc_msg_free_info(struct msg_data *msgds) +{ + while (msgds) { + struct msg_data *next = msgds->next; + free(msgds); + msgds = next; + } +} + +void ipc_print_perms(FILE *f, struct ipc_stat *is) +{ + struct passwd *pw; + struct group *gr; + + fprintf(f, "%-10d %-10o", is->id, is->mode & 0777); + + if ((pw = getpwuid(is->cuid))) + fprintf(f, " %-10s", pw->pw_name); + else + fprintf(f, " %-10u", is->cuid); + + if ((gr = getgrgid(is->cgid))) + fprintf(f, " %-10s", gr->gr_name); + else + fprintf(f, " %-10u", is->cgid); + + if ((pw = getpwuid(is->uid))) + fprintf(f, " %-10s", pw->pw_name); + else + fprintf(f, " %-10u", is->uid); + + if ((gr = getgrgid(is->gid))) + fprintf(f, " %-10s\n", gr->gr_name); + else + fprintf(f, " %-10u\n", is->gid); +} + +void ipc_print_size(int unit, char *msg, uint64_t size, const char *end, + int width) +{ + char format[32]; + + if (!msg) + /* NULL */ ; + else if (msg[strlen(msg) - 1] == '=') + printf("%s", msg); + else if (unit == IPC_UNIT_BYTES) + printf(_("%s (bytes) = "), msg); + else if (unit == IPC_UNIT_KB) + printf(_("%s (kbytes) = "), msg); + else + printf("%s = ", msg); + + switch (unit) { + case IPC_UNIT_DEFAULT: + case IPC_UNIT_BYTES: + sprintf(format, "%%%dju", width); + printf(format, size); + break; + case IPC_UNIT_KB: + sprintf(format, "%%%dju", width); + printf(format, size / 1024); + break; + case IPC_UNIT_HUMAN: + { + char *tmp; + sprintf(format, "%%%ds", width); + printf(format, (tmp = size_to_human_string(SIZE_SUFFIX_1LETTER, size))); + free(tmp); + break; + } + default: + /* impossible occurred */ + abort(); + } + + if (end) + printf("%s", end); +} diff --git a/sys-utils/ipcutils.h b/sys-utils/ipcutils.h new file mode 100644 index 0000000..db85f57 --- /dev/null +++ b/sys-utils/ipcutils.h @@ -0,0 +1,187 @@ +#ifndef UTIL_LINUX_IPCUTILS_H +#define UTIL_LINUX_IPCUTILS_H + +#include <stdio.h> +#include <stdlib.h> +#include <sys/ipc.h> +#include <sys/msg.h> +#include <sys/sem.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> +#include <grp.h> +#include <pwd.h> +#include <stdint.h> + +/* + * SHM_DEST and SHM_LOCKED are defined in kernel headers, but inside + * #ifdef __KERNEL__ ... #endif + */ +#ifndef SHM_DEST + /* shm_mode upper byte flags */ +# define SHM_DEST 01000 /* segment will be destroyed on last detach */ +# define SHM_LOCKED 02000 /* segment will not be swapped */ +#endif + +/* For older kernels the same holds for the defines below */ +#ifndef MSG_STAT +# define MSG_STAT 11 +# define MSG_INFO 12 +#endif + +#ifndef SHM_STAT +# define SHM_STAT 13 +# define SHM_INFO 14 +struct shm_info { + int used_ids; + unsigned long shm_tot; /* total allocated shm */ + unsigned long shm_rss; /* total resident shm */ + unsigned long shm_swp; /* total swapped shm */ + unsigned long swap_attempts; + unsigned long swap_successes; +}; +#endif + +#ifndef SEM_STAT +# define SEM_STAT 18 +# define SEM_INFO 19 +#endif + +/* Some versions of libc only define IPC_INFO when __USE_GNU is defined. */ +#ifndef IPC_INFO +# define IPC_INFO 3 +#endif + +/* + * * The last arg of semctl is a union semun, but where is it defined? X/OPEN + * * tells us to define it ourselves, but until recently Linux include files + * * would also define it. + * */ +#ifndef HAVE_UNION_SEMUN +/* according to X/OPEN we have to define it ourselves */ +union semun { + int val; + struct semid_ds *buf; + unsigned short int *array; + struct seminfo *__buf; +}; +#endif + +/* + * X/OPEN (Jan 1987) does not define fields key, seq in struct ipc_perm; + * glibc-1.09 has no support for sysv ipc. + * glibc 2 uses __key, __seq + */ +#if defined (__GLIBC__) && __GLIBC__ >= 2 +# define KEY __key +#else +# define KEY key +#endif + +/* Size printing in ipcs is using these. */ +enum { + IPC_UNIT_DEFAULT, + IPC_UNIT_BYTES, + IPC_UNIT_KB, + IPC_UNIT_HUMAN +}; + +struct ipc_limits { + uint64_t shmmni; /* max number of segments */ + uint64_t shmmax; /* max segment size */ + uint64_t shmall; /* max total shared memory */ + uint64_t shmmin; /* min segment size */ + + int semmni; /* max number of arrays */ + int semmsl; /* max semaphores per array */ + int semmns; /* max semaphores system wide */ + int semopm; /* max ops per semop call */ + unsigned int semvmx; /* semaphore max value (constant) */ + + int msgmni; /* max queues system wide */ + uint64_t msgmax; /* max size of message */ + int msgmnb; /* default max size of queue */ +}; + +extern int ipc_msg_get_limits(struct ipc_limits *lim); +extern int ipc_sem_get_limits(struct ipc_limits *lim); +extern int ipc_shm_get_limits(struct ipc_limits *lim); + +struct ipc_stat { + int id; + key_t key; + uid_t uid; /* current uid */ + gid_t gid; /* current gid */ + uid_t cuid; /* creator uid */ + gid_t cgid; /* creator gid */ + unsigned int mode; +}; + +extern void ipc_print_perms(FILE *f, struct ipc_stat *is); +extern void ipc_print_size(int unit, char *msg, uint64_t size, const char *end, int width); + +/* See 'struct shmid_kernel' in kernel sources + */ +struct shm_data { + struct ipc_stat shm_perm; + + uint64_t shm_nattch; + uint64_t shm_segsz; + int64_t shm_atim; /* __kernel_time_t is signed long */ + int64_t shm_dtim; + int64_t shm_ctim; + pid_t shm_cprid; + pid_t shm_lprid; + uint64_t shm_rss; + uint64_t shm_swp; + + struct shm_data *next; +}; + +extern int ipc_shm_get_info(int id, struct shm_data **shmds); +extern void ipc_shm_free_info(struct shm_data *shmds); + +/* See 'struct sem_array' in kernel sources + */ +struct sem_elem { + int semval; + int ncount; /* processes waiting on increase semval */ + int zcount; /* processes waiting on semval set to zero */ + pid_t pid; /* process last executed semop(2) call */ +}; +struct sem_data { + struct ipc_stat sem_perm; + + int64_t sem_ctime; + int64_t sem_otime; + uint64_t sem_nsems; + + struct sem_elem *elements; + struct sem_data *next; +}; + +extern int ipc_sem_get_info(int id, struct sem_data **semds); +extern void ipc_sem_free_info(struct sem_data *semds); + +/* See 'struct msg_queue' in kernel sources + */ +struct msg_data { + struct ipc_stat msg_perm; + + int64_t q_stime; + int64_t q_rtime; + int64_t q_ctime; + uint64_t q_cbytes; + uint64_t q_qnum; + uint64_t q_qbytes; + pid_t q_lspid; + pid_t q_lrpid; + + struct msg_data *next; +}; + +extern int ipc_msg_get_info(int id, struct msg_data **msgds); +extern void ipc_msg_free_info(struct msg_data *msgds); + +#endif /* UTIL_LINUX_IPCUTILS_H */ diff --git a/sys-utils/ldattach.8 b/sys-utils/ldattach.8 new file mode 100644 index 0000000..1b4683d --- /dev/null +++ b/sys-utils/ldattach.8 @@ -0,0 +1,155 @@ +.\" Copyright 2008 Tilman Schmidt (tilman@imap.cc) +.\" May be distributed under the GNU General Public License version 2 or later +.TH LDATTACH 8 "July 2014" "util-linux" "System Administration" +.SH NAME +ldattach \- attach a line discipline to a serial line +.SH SYNOPSIS +.B ldattach +.RB [ \-1278denoVh ] +.RB [ \-i +.IR iflag ] +.RB [ \-s +.IR speed ] +.I ldisc device +.SH DESCRIPTION +The +.B ldattach +daemon opens the specified +.I device +file +(which should refer to a serial device) +and attaches the line discipline +.I ldisc +to it for processing of the sent and/or received data. +It then goes into the background keeping the device open so that the +line discipline stays loaded. +.sp +The line discipline +.I ldisc +may be specified either by name +or by number. +.sp +In order to detach the line discipline, +.BR kill (1) +the +.B ldattach +process. +.sp +With no arguments, +.B ldattach +prints usage information. +.SH LINE DISCIPLINES +Depending on the kernel release, the following line disciplines are supported: +.TP +.BR TTY ( 0 ) +The default line discipline, +providing transparent operation (raw mode) +as well as the habitual terminal line editing capabilities (cooked mode). +.TP +.BR SLIP ( 1 ) +Serial Line IP (SLIP) protocol processor +for transmitting TCP/IP packets over serial lines. +.TP +.BR MOUSE ( 2 ) +Device driver for RS232 connected pointing devices (serial mice). +.TP +.BR PPP ( 3 ) +Point to Point Protocol (PPP) processor +for transmitting network packets over serial lines. +.TP +.BR STRIP ( 4 ) +.TP +.BR AX25 ( 5 ) +.TP +.BR X25 ( 6 ) +Line driver for transmitting X.25 packets over asynchronous serial lines. +.TP +.BR 6PACK ( 7 ) +.TP +.BR R3964 ( 9 ) +Driver for Simatic R3964 module. +.TP +.BR IRDA ( 11 ) +Linux IrDa (infrared data transmission) driver - +see http://irda.sourceforge.net/ +.TP +.BR HDLC ( 13 ) +Synchronous HDLC driver. +.TP +.BR SYNC_PPP ( 14 ) +Synchronous PPP driver. +.TP +.BR HCI ( 15 ) +Bluetooth HCI UART driver. +.TP +.BR GIGASET_M101 ( 16 ) +Driver for Siemens Gigaset M101 serial DECT adapter. +.TP +.BR PPS ( 18 ) +Driver for serial line Pulse Per Second (PPS) source. +.TP +.BR GSM0710 ( 21 ) +Driver for GSM 07.10 multiplexing protocol modem (CMUX). +.SH OPTIONS +.TP +.BR \-1 , " \-\-onestopbit" +Set the number of stop bits of the serial line to one. +.TP +.BR \-2 , " \-\-twostopbits" +Set the number of stop bits of the serial line to two. +.TP +.BR \-7 , " \-\-sevenbits" +Set the character size of the serial line to 7 bits. +.TP +.BR \-8 , " \-\-eightbits" +Set the character size of the serial line to 8 bits. +.TP +.BR \-d , " \-\-debug" +Keep +.B ldattach +in the foreground so that it can be interrupted or debugged, +and to print verbose messages about its progress to standard error output. +.TP +.BR \-e , " \-\-evenparity" +Set the parity of the serial line to even. +.TP +.BR -i , " --iflag " [ \- ] \fIvalue\fR... +Set the specified bits in the c_iflag word of the serial line. +The given \fIvalue\fP may be a number or a symbolic name. +If \fIvalue\fP is prefixed by a minus sign, the specified bits are cleared +instead. Several comma-separated values may be given in order to +set and clear multiple bits. +.TP +.BR \-n , " \-\-noparity" +Set the parity of the serial line to none. +.TP +.BR \-o , " \-\-oddparity" +Set the parity of the serial line to odd. +.TP +.BR \-s , " \-\-speed " \fIvalue +Set the speed (the baud rate) of the serial line to the specified \fIvalue\fR. +.TP +.BR \-c , " \-\-intro\-command " \fIstring +Define an intro command that is sent through the serial line before the invocation +of ldattach. E.g. in conjunction with line discipline GSM0710, the command +\'AT+CMUX=0\\r\' is commonly suitable to switch the modem into the CMUX mode. +.TP +.BR \-p , " \-\-pause " \fIvalue +Sleep for \fIvalue\fR seconds before the invocation of ldattach. Default is one second. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH "SEE ALSO" +.BR inputattach (1), +.BR ttys (4) +.SH AUTHOR +.nf +Tilman Schmidt (tilman@imap.cc) +.fi +.SH AVAILABILITY +The ldattach command is part of the util-linux package +and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/ldattach.c b/sys-utils/ldattach.c new file mode 100644 index 0000000..d33d685 --- /dev/null +++ b/sys-utils/ldattach.c @@ -0,0 +1,489 @@ +/* line discipline loading daemon + * open a serial device and attach a line discipline on it + * + * Usage: + * ldattach GIGASET_M101 /dev/ttyS0 + * + * ===================================================================== + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * ===================================================================== + */ + +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <termios.h> +#include <unistd.h> + +#include "c.h" +#include "all-io.h" +#include "nls.h" +#include "strutils.h" +#include "closestream.h" + +#include <signal.h> +#include <sys/socket.h> +#include <linux/if.h> + +#include <linux/tty.h> /* for N_GSM0710 */ + +#ifdef LINUX_GSMMUX_H +# include <linux/gsmmux.h> /* Add by guowenxue */ +#else +struct gsm_config +{ + unsigned int adaption; + unsigned int encapsulation; + unsigned int initiator; + unsigned int t1; + unsigned int t2; + unsigned int t3; + unsigned int n2; + unsigned int mru; + unsigned int mtu; + unsigned int k; + unsigned int i; + unsigned int unused[8]; /* Padding for expansion without + breaking stuff */ +}; +# define GSMIOC_GETCONF _IOR('G', 0, struct gsm_config) +# define GSMIOC_SETCONF _IOW('G', 1, struct gsm_config) +#endif + +#ifndef N_GIGASET_M101 +# define N_GIGASET_M101 16 +#endif + +#ifndef N_PPS +# define N_PPS 18 +#endif + +#ifndef N_GSM0710 +# define N_GSM0710 21 +#endif + +#define MAXINTROPARMLEN 32 + +/* attach a line discipline ioctl */ +#ifndef TIOCSETD +# define TIOCSETD 0x5423 +#endif + +static int debug = 0; + +struct ld_table { + const char *name; + int value; +}; + +/* currently supported line disciplines, plus some aliases */ +static const struct ld_table ld_discs[] = { + { "TTY", N_TTY }, + { "SLIP", N_SLIP }, + { "MOUSE", N_MOUSE }, + { "PPP", N_PPP }, + { "STRIP", N_STRIP }, + { "AX25", N_AX25 }, + { "X25", N_X25 }, + { "6PACK", N_6PACK }, + { "R3964", N_R3964 }, + { "IRDA", N_IRDA }, + { "HDLC", N_HDLC }, + { "SYNC_PPP", N_SYNC_PPP }, + { "SYNCPPP", N_SYNC_PPP }, + { "HCI", N_HCI }, + { "GIGASET_M101", N_GIGASET_M101 }, + { "M101", N_GIGASET_M101 }, + { "GIGASET", N_GIGASET_M101 }, + { "PPS", N_PPS }, + { "GSM0710", N_GSM0710}, + { NULL, 0 } +}; + +/* known c_iflag names */ +static const struct ld_table ld_iflags[] = +{ + { "IGNBRK", IGNBRK }, + { "BRKINT", BRKINT }, + { "IGNPAR", IGNPAR }, + { "PARMRK", PARMRK }, + { "INPCK", INPCK }, + { "ISTRIP", ISTRIP }, + { "INLCR", INLCR }, + { "IGNCR", IGNCR }, + { "ICRNL", ICRNL }, + { "IUCLC", IUCLC }, + { "IXON", IXON }, + { "IXANY", IXANY }, + { "IXOFF", IXOFF }, + { "IMAXBEL", IMAXBEL }, + { "IUTF8", IUTF8 }, + { NULL, 0 } +}; + +static void dbg(char *fmt, ...) +{ + va_list args; + + if (debug == 0) + return; + fflush(NULL); + va_start(args, fmt); +#ifdef HAVE_VWARNX + vwarnx(fmt, args); +#else + fprintf(stderr, "%s: ", program_invocation_short_name); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); +#endif + va_end(args); + fflush(NULL); + return; +} + +static int lookup_table(const struct ld_table *tab, const char *str) +{ + const struct ld_table *t; + + for (t = tab; t && t->name; t++) + if (!strcasecmp(t->name, str)) + return t->value; + return -1; +} + +static void print_table(FILE * out, const struct ld_table *tab) +{ + const struct ld_table *t; + int i; + + for (t = tab, i = 1; t && t->name; t++, i++) { + fprintf(out, " %-12s", t->name); + if (!(i % 5)) + fputc('\n', out); + } +} + +static int parse_iflag(char *str, int *set_iflag, int *clr_iflag) +{ + int iflag; + char *s; + + for (s = strtok(str, ","); s != NULL; s = strtok(NULL, ",")) { + if (*s == '-') + s++; + if ((iflag = lookup_table(ld_iflags, s)) < 0) + iflag = strtos32_or_err(s, _("invalid iflag")); + if (s > str && *(s - 1) == '-') + *clr_iflag |= iflag; + else + *set_iflag |= iflag; + } + dbg("iflag (set/clear): %d/%d", *set_iflag, *clr_iflag); + return 0; +} + + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] <ldisc> <device>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Attach a line discipline to a serial line.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -d, --debug print verbose messages to stderr\n"), out); + fputs(_(" -s, --speed <value> set serial line speed\n"), out); + fputs(_(" -c, --intro-command <string> intro sent before ldattach\n"), out); + fputs(_(" -p, --pause <seconds> pause between intro and ldattach\n"), out); + fputs(_(" -7, --sevenbits set character size to 7 bits\n"), out); + fputs(_(" -8, --eightbits set character size to 8 bits\n"), out); + fputs(_(" -n, --noparity set parity to none\n"), out); + fputs(_(" -e, --evenparity set parity to even\n"), out); + fputs(_(" -o, --oddparity set parity to odd\n"), out); + fputs(_(" -1, --onestopbit set stop bits to one\n"), out); + fputs(_(" -2, --twostopbits set stop bits to two\n"), out); + fputs(_(" -i, --iflag [-]<iflag> set input mode flag\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(25)); + + fputs(_("\nKnown <ldisc> names:\n"), out); + print_table(out, ld_discs); + fputs(USAGE_SEPARATOR, out); + + fputs(_("\nKnown <iflag> names:\n"), out); + print_table(out, ld_iflags); + + printf(USAGE_MAN_TAIL("ldattach(8)")); + exit(EXIT_SUCCESS); +} + +static int my_cfsetspeed(struct termios *ts, int speed) +{ + /* Standard speeds + * -- cfsetspeed() is able to translate number to Bxxx constants + */ + if (cfsetspeed(ts, speed) == 0) + return 0; + + /* Nonstandard speeds + * -- we have to bypass glibc and set the speed manually (because glibc + * checks for speed and supports Bxxx bit rates only)... + */ +#ifdef _HAVE_STRUCT_TERMIOS_C_ISPEED +# define BOTHER 0010000 /* non standard rate */ + dbg("using non-standard speeds"); + ts->c_ospeed = ts->c_ispeed = speed; + ts->c_cflag &= ~CBAUD; + ts->c_cflag |= BOTHER; + return 0; +#else + return -1; +#endif +} + +static void handler(int s) +{ + dbg("got SIG %i -> exiting", s); + exit(EXIT_SUCCESS); +} + +static void gsm0710_set_conf(int tty_fd) +{ + struct gsm_config c; + + /* Add by guowenxue */ + /* get n_gsm configuration */ + ioctl(tty_fd, GSMIOC_GETCONF, &c); + /* we are initiator and need encoding 0 (basic) */ + c.initiator = 1; + c.encapsulation = 0; + /* our modem defaults to a maximum size of 127 bytes */ + c.mru = 127; + c.mtu = 127; + /* set the new configuration */ + ioctl(tty_fd, GSMIOC_SETCONF, &c); + /* Add by guowenxue end*/ +} + +int main(int argc, char **argv) +{ + int tty_fd; + struct termios ts; + int speed = 0, bits = '-', parity = '-', stop = '-'; + int set_iflag = 0, clr_iflag = 0; + int ldisc; + int optc; + char *dev; + int intropause = 1; + char *introparm = NULL; + + static const struct option opttbl[] = { + {"speed", required_argument, NULL, 's'}, + {"sevenbits", no_argument, NULL, '7'}, + {"eightbits", no_argument, NULL, '8'}, + {"noparity", no_argument, NULL, 'n'}, + {"evenparity", no_argument, NULL, 'e'}, + {"oddparity", no_argument, NULL, 'o'}, + {"onestopbit", no_argument, NULL, '1'}, + {"twostopbits", no_argument, NULL, '2'}, + {"iflag", required_argument, NULL, 'i'}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + {"debug", no_argument, NULL, 'd'}, + {"intro-command", no_argument, NULL, 'c'}, + {"pause", no_argument, NULL, 'p'}, + {NULL, 0, NULL, 0} + }; + + signal(SIGKILL, handler); + signal(SIGINT, handler); + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + /* parse options */ + if (argc == 0) + errx(EXIT_FAILURE, _("bad usage")); + + while ((optc = + getopt_long(argc, argv, "dhV78neo12s:i:c:p:", opttbl, + NULL)) >= 0) { + switch (optc) { + case 'd': + debug = 1; + break; + case '1': + case '2': + stop = optc; + break; + case '7': + case '8': + bits = optc; + break; + case 'n': + case 'e': + case 'o': + parity = optc; + break; + case 's': + speed = strtos32_or_err(optarg, _("invalid speed argument")); + break; + case 'p': + intropause = strtou32_or_err(optarg, _("invalid pause argument")); + if (intropause > 10) + errx(EXIT_FAILURE, "invalid pause: %s", optarg); + break; + case 'c': + introparm = optarg; + break; + case 'i': + parse_iflag(optarg, &set_iflag, &clr_iflag); + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (argc - optind != 2) { + warnx(_("not enough arguments")); + errtryhelp(EXIT_FAILURE); + } + /* parse line discipline specification */ + ldisc = lookup_table(ld_discs, argv[optind]); + if (ldisc < 0) + ldisc = strtos32_or_err(argv[optind], _("invalid line discipline argument")); + + /* ldisc specific option settings */ + if (ldisc == N_GIGASET_M101) { + /* device specific defaults for line speed and data format */ + if (speed == 0) + speed = 115200; + if (bits == '-') + bits = '8'; + if (parity == '-') + parity = 'n'; + if (stop == '-') + stop = '1'; + } + + /* open device */ + dev = argv[optind + 1]; + if ((tty_fd = open(dev, O_RDWR | O_NOCTTY)) < 0) + err(EXIT_FAILURE, _("cannot open %s"), dev); + if (!isatty(tty_fd)) + errx(EXIT_FAILURE, _("%s is not a serial line"), dev); + + dbg("opened %s", dev); + + /* set line speed and format */ + if (tcgetattr(tty_fd, &ts) < 0) + err(EXIT_FAILURE, + _("cannot get terminal attributes for %s"), dev); + cfmakeraw(&ts); + if (speed && my_cfsetspeed(&ts, speed) < 0) + errx(EXIT_FAILURE, _("speed %d unsupported"), speed); + + switch (stop) { + case '1': + ts.c_cflag &= ~CSTOPB; + break; + case '2': + ts.c_cflag |= CSTOPB; + break; + case '-': + break; + default: + abort(); + } + switch (bits) { + case '7': + ts.c_cflag = (ts.c_cflag & ~CSIZE) | CS7; + break; + case '8': + ts.c_cflag = (ts.c_cflag & ~CSIZE) | CS8; + break; + case '-': + break; + default: + abort(); + } + switch (parity) { + case 'n': + ts.c_cflag &= ~(PARENB | PARODD); + break; + case 'e': + ts.c_cflag |= PARENB; + ts.c_cflag &= ~PARODD; + break; + case 'o': + ts.c_cflag |= (PARENB | PARODD); + break; + case '-': + break; + default: + abort(); + } + + ts.c_cflag |= CREAD; /* just to be on the safe side */ + ts.c_iflag |= set_iflag; + ts.c_iflag &= ~clr_iflag; + + if (tcsetattr(tty_fd, TCSAFLUSH, &ts) < 0) + err(EXIT_FAILURE, + _("cannot set terminal attributes for %s"), dev); + + dbg("set to raw %d %c%c%c: cflag=0x%x", + speed, bits, parity, stop, ts.c_cflag); + + if (introparm && *introparm) + { + dbg("intro command is '%s'", introparm); + if (write_all(tty_fd, introparm, strlen(introparm)) != 0) + err(EXIT_FAILURE, + _("cannot write intro command to %s"), dev); + + if (intropause) { + dbg("waiting for %d seconds", intropause); + sleep(intropause); + } + } + + /* Attach the line discipline. */ + if (ioctl(tty_fd, TIOCSETD, &ldisc) < 0) + err(EXIT_FAILURE, _("cannot set line discipline")); + + dbg("line discipline set to %d", ldisc); + + /* ldisc specific post-attach actions */ + if (ldisc == N_GSM0710) + gsm0710_set_conf(tty_fd); + + /* Go into background if not in debug mode. */ + if (!debug && daemon(0, 0) < 0) + err(EXIT_FAILURE, _("cannot daemonize")); + + /* Sleep to keep the line discipline active. */ + pause(); + + exit(EXIT_SUCCESS); +} diff --git a/sys-utils/losetup.8 b/sys-utils/losetup.8 new file mode 100644 index 0000000..c31d747 --- /dev/null +++ b/sys-utils/losetup.8 @@ -0,0 +1,208 @@ +.TH LOSETUP 8 "November 2015" "util-linux" "System Administration" +.SH NAME +losetup \- set up and control loop devices +.SH SYNOPSIS +.ad l +Get info: +.sp +.in +5 +.B losetup +[\fIloopdev\fP] +.sp +.B losetup -l +.RB [ \-a ] +.sp +.B losetup -j +.I file +.RB [ \-o +.IR offset ] +.sp +.in -5 +Detach a loop device: +.sp +.in +5 +.B "losetup \-d" +.IR loopdev ... +.sp +.in -5 +Detach all associated loop devices: +.sp +.in +5 +.B "losetup \-D" +.sp +.in -5 +Set up a loop device: +.sp +.in +5 +.B losetup +.RB [ \-o +.IR offset ] +.RB [ \-\-sizelimit +.IR size ] +.RB [ \-\-sector\-size +.IR size ] +.in +8 +.RB [ \-Pr ] +.RB [ \-\-show ] " \-f" | \fIloopdev\fP +.I file +.sp +.in -13 +Resize a loop device: +.sp +.in +5 +.B "losetup \-c" +.I loopdev +.in -5 +.ad b +.SH DESCRIPTION +.B losetup +is used to associate loop devices with regular files or block devices, +to detach loop devices, and to query the status of a loop device. If only the +\fIloopdev\fP argument is given, the status of the corresponding loop +device is shown. If no option is given, all loop devices are shown. +.sp +Note that the old output format (i.e., \fBlosetup -a\fR) with comma-delimited +strings is deprecated in favour of the \fB--list\fR output format. +.sp +It's possible to create more independent loop devices for the same backing +file. +.B This setup may be dangerous, can cause data loss, corruption and overwrites. +Use \fB\-\-nooverlap\fR with \fB\-\-find\fR during setup to avoid this problem. + +.SH OPTIONS +The \fIsize\fR and \fIoffset\fR +arguments may be followed by the multiplicative suffixes KiB (=1024), +MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB and YiB (the "iB" is +optional, e.g., "K" has the same meaning as "KiB") or the suffixes +KB (=1000), MB (=1000*1000), and so on for GB, TB, PB, EB, ZB and YB. + +.TP +.BR \-a , " \-\-all" +Show the status of all loop devices. Note that not all information is accessible +for non-root users. See also \fB\-\-list\fR. The old output format (as printed +without \fB--list)\fR is deprecated. +.TP +.BR \-d , " \-\-detach " \fIloopdev\fR... +Detach the file or device associated with the specified loop device(s). Note +that since Linux v3.7 kernel uses "lazy device destruction". The detach +operation does not return EBUSY error anymore if device is actively used by +system, but it is marked by autoclear flag and destroyed later. +.TP +.BR \-D , " \-\-detach\-all" +Detach all associated loop devices. +.TP +.BR \-f , " \-\-find " "\fR[\fIfile\fR]" +Find the first unused loop device. If a \fIfile\fR argument is present, use +the found device as loop device. Otherwise, just print its name. +.IP "\fB\-\-show\fP" +Display the name of the assigned loop device if the \fB\-f\fP option and a +\fIfile\fP argument are present. +.TP +.BR \-L , " \-\-nooverlap" +Check for conflicts between loop devices to avoid situation when the same +backing file is shared between more loop devices. If the file is already used +by another device then re-use the device rather than a new one. The option +makes sense only with \fB\-\-find\fP. +.TP +.BR \-j , " \-\-associated " \fIfile\fR " \fR[\fB\-o \fIoffset\fR]" +Show the status of all loop devices associated with the given \fIfile\fR. +.TP +.BR \-o , " \-\-offset " \fIoffset +The data start is moved \fIoffset\fP bytes into the specified file or device. The \fIoffset\fP +may be followed by the multiplicative suffixes; see above. +.IP "\fB\-\-sizelimit \fIsize\fP" +The data end is set to no more than \fIsize\fP bytes after the data start. The \fIsize\fP +may be followed by the multiplicative suffixes; see above. +.TP +.BR \-b , " \-\-sector-size " \fIsize +Set the logical sector size of the loop device in bytes (since Linux 4.14). The +option may be used when create a new loop device as well as stand-alone command +to modify sector size of the already existing loop device. +.TP +.BR \-c , " \-\-set\-capacity " \fIloopdev +Force the loop driver to reread the size of the file associated with the +specified loop device. +.TP +.BR \-P , " \-\-partscan" +Force the kernel to scan the partition table on a newly created loop device. +.TP +.BR \-r , " \-\-read\-only" +Set up a read-only loop device. +.TP +.BR \-\-direct\-io [ =on | off ] +Enable or disable direct I/O for the backing file. The optional argument +can be either \fBon\fR or \fBoff\fR. If the argument is omitted, it defaults +to \fBon\fR. +.TP +.BR \-v , " \-\-verbose" +Verbose mode. +.TP +.BR \-l , " \-\-list" +If a loop device or the \fB-a\fR option is specified, print the default columns +for either the specified loop device or all loop devices; the default is to +print info about all devices. See also \fB\-\-output\fP, \fB\-\-noheadings\fP, +\fB\-\-raw\fP, and \fB\-\-json\fP. +.TP +.BR \-O , " \-\-output " \fIcolumn\fR[,\fIcolumn\fR]... +Specify the columns that are to be printed for the \fB\-\-list\fP output. +Use \fB\-\-help\fR to get a list of all supported columns. +.TP +.B \-\-output\-all +Output all available columns. +.TP +.BR \-n , " \-\-noheadings" +Don't print headings for \fB\-\-list\fP output format. +.IP "\fB\-\-raw\fP" +Use the raw \fB\-\-list\fP output format. +.TP +.BR \-J , " \-\-json" +Use JSON format for \fB\-\-list\fP output. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. + +.SH ENCRYPTION +.B Cryptoloop is no longer supported in favor of dm-crypt. +.B For more details see cryptsetup(8). + +.SH RETURN VALUE +.B losetup +returns 0 on success, nonzero on failure. When +.B losetup +displays the status of a loop device, it returns 1 if the device +is not configured and 2 if an error occurred which prevented +determining the status of the device. + +.SH FILES +.TP +.I /dev/loop[0..N] +loop block devices +.TP +.I /dev/loop-control +loop control device + +.SH EXAMPLE +The following commands can be used as an example of using the loop device. +.nf +.IP +# dd if=/dev/zero of=~/file.img bs=1024k count=10 +# losetup --find --show ~/file.img +/dev/loop0 +# mkfs -t ext2 /dev/loop0 +# mount /dev/loop0 /mnt + ... +# umount /dev/loop0 +# losetup --detach /dev/loop0 +.fi +.SH ENVIRONMENT +.IP LOOPDEV_DEBUG=all +enables debug output. +.SH AUTHORS +Karel Zak <kzak@redhat.com>, based on the original version from +Theodore Ts'o <tytso@athena.mit.edu> +.SH AVAILABILITY +The losetup command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/losetup.c b/sys-utils/losetup.c new file mode 100644 index 0000000..7d14f56 --- /dev/null +++ b/sys-utils/losetup.c @@ -0,0 +1,917 @@ +/* + * Copyright (C) 2011 Karel Zak <kzak@redhat.com> + * Originally from Ted's losetup.c + * + * losetup.c - setup and control loop devices + */ +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <inttypes.h> +#include <getopt.h> + +#include <libsmartcols.h> + +#include "c.h" +#include "nls.h" +#include "strutils.h" +#include "loopdev.h" +#include "closestream.h" +#include "optutils.h" +#include "xalloc.h" +#include "canonicalize.h" +#include "pathnames.h" + +enum { + A_CREATE = 1, /* setup a new device */ + A_DELETE, /* delete given device(s) */ + A_DELETE_ALL, /* delete all devices */ + A_SHOW, /* list devices */ + A_SHOW_ONE, /* print info about one device */ + A_FIND_FREE, /* find first unused */ + A_SET_CAPACITY, /* set device capacity */ + A_SET_DIRECT_IO, /* set accessing backing file by direct io */ + A_SET_BLOCKSIZE, /* set logical block size of the loop device */ +}; + +enum { + COL_NAME = 0, + COL_AUTOCLR, + COL_BACK_FILE, + COL_BACK_INO, + COL_BACK_MAJMIN, + COL_MAJMIN, + COL_OFFSET, + COL_PARTSCAN, + COL_RO, + COL_SIZELIMIT, + COL_DIO, + COL_LOGSEC, +}; + +/* basic output flags */ +static int no_headings; +static int raw; +static int json; + +struct colinfo { + const char *name; + double whint; + int flags; + const char *help; + + int json_type; /* default is string */ +}; + +static struct colinfo infos[] = { + [COL_AUTOCLR] = { "AUTOCLEAR", 1, SCOLS_FL_RIGHT, N_("autoclear flag set"), SCOLS_JSON_BOOLEAN}, + [COL_BACK_FILE] = { "BACK-FILE", 0.3, 0, N_("device backing file")}, + [COL_BACK_INO] = { "BACK-INO", 4, SCOLS_FL_RIGHT, N_("backing file inode number"), SCOLS_JSON_NUMBER}, + [COL_BACK_MAJMIN] = { "BACK-MAJ:MIN", 6, 0, N_("backing file major:minor device number")}, + [COL_NAME] = { "NAME", 0.25, 0, N_("loop device name")}, + [COL_OFFSET] = { "OFFSET", 5, SCOLS_FL_RIGHT, N_("offset from the beginning"), SCOLS_JSON_NUMBER}, + [COL_PARTSCAN] = { "PARTSCAN", 1, SCOLS_FL_RIGHT, N_("partscan flag set"), SCOLS_JSON_BOOLEAN}, + [COL_RO] = { "RO", 1, SCOLS_FL_RIGHT, N_("read-only device"), SCOLS_JSON_BOOLEAN}, + [COL_SIZELIMIT] = { "SIZELIMIT", 5, SCOLS_FL_RIGHT, N_("size limit of the file in bytes"), SCOLS_JSON_NUMBER}, + [COL_MAJMIN] = { "MAJ:MIN", 3, 0, N_("loop device major:minor number")}, + [COL_DIO] = { "DIO", 1, SCOLS_FL_RIGHT, N_("access backing file with direct-io"), SCOLS_JSON_BOOLEAN}, + [COL_LOGSEC] = { "LOG-SEC", 4, SCOLS_FL_RIGHT, N_("logical sector size in bytes"), SCOLS_JSON_NUMBER}, +}; + +static int columns[ARRAY_SIZE(infos) * 2] = {-1}; +static size_t ncolumns; + +static int get_column_id(int num) +{ + assert(num >= 0); + assert((size_t) num < ncolumns); + assert(columns[num] < (int) ARRAY_SIZE(infos)); + return columns[num]; +} + +static struct colinfo *get_column_info(int num) +{ + return &infos[ get_column_id(num) ]; +} + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(infos); i++) { + const char *cn = infos[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static int printf_loopdev(struct loopdev_cxt *lc) +{ + uint64_t x; + dev_t dev = 0; + ino_t ino = 0; + char *fname; + uint32_t type; + + fname = loopcxt_get_backing_file(lc); + if (!fname) + return -EINVAL; + + if (loopcxt_get_backing_devno(lc, &dev) == 0) + loopcxt_get_backing_inode(lc, &ino); + + if (!dev && !ino) { + /* + * Probably non-root user (no permissions to + * call LOOP_GET_STATUS ioctls). + */ + printf("%s: []: (%s)", + loopcxt_get_device(lc), fname); + + if (loopcxt_get_offset(lc, &x) == 0 && x) + printf(_(", offset %ju"), x); + + if (loopcxt_get_sizelimit(lc, &x) == 0 && x) + printf(_(", sizelimit %ju"), x); + goto done; + } + + printf("%s: [%04d]:%" PRIu64 " (%s)", + loopcxt_get_device(lc), (int) dev, ino, fname); + + if (loopcxt_get_offset(lc, &x) == 0 && x) + printf(_(", offset %ju"), x); + + if (loopcxt_get_sizelimit(lc, &x) == 0 && x) + printf(_(", sizelimit %ju"), x); + + if (loopcxt_get_encrypt_type(lc, &type) == 0) { + const char *e = loopcxt_get_crypt_name(lc); + + if ((!e || !*e) && type == 1) + e = "XOR"; + if (e && *e) + printf(_(", encryption %s (type %u)"), e, type); + } + +done: + free(fname); + printf("\n"); + return 0; +} + +static int show_all_loops(struct loopdev_cxt *lc, const char *file, + uint64_t offset, int flags) +{ + struct stat sbuf, *st = &sbuf; + char *cn_file = NULL; + + if (loopcxt_init_iterator(lc, LOOPITER_FL_USED)) + return -1; + + if (!file || stat(file, st)) + st = NULL; + + while (loopcxt_next(lc) == 0) { + if (file) { + int used; + const char *bf = cn_file ? cn_file : file; + + used = loopcxt_is_used(lc, st, bf, offset, 0, flags); + if (!used && !cn_file) { + bf = cn_file = canonicalize_path(file); + used = loopcxt_is_used(lc, st, bf, offset, 0, flags); + } + if (!used) + continue; + } + printf_loopdev(lc); + } + loopcxt_deinit_iterator(lc); + free(cn_file); + return 0; +} + +static int delete_loop(struct loopdev_cxt *lc) +{ + if (loopcxt_delete_device(lc)) + warn(_("%s: detach failed"), loopcxt_get_device(lc)); + else + return 0; + + return -1; +} + +static int delete_all_loops(struct loopdev_cxt *lc) +{ + int res = 0; + + if (loopcxt_init_iterator(lc, LOOPITER_FL_USED)) + return -1; + + while (loopcxt_next(lc) == 0) + res += delete_loop(lc); + + loopcxt_deinit_iterator(lc); + return res; +} + +static int set_scols_data(struct loopdev_cxt *lc, struct libscols_line *ln) +{ + size_t i; + + for (i = 0; i < ncolumns; i++) { + const char *p = NULL; /* external data */ + char *np = NULL; /* allocated here */ + uint64_t x = 0; + int rc = 0; + + switch(get_column_id(i)) { + case COL_NAME: + p = loopcxt_get_device(lc); + break; + case COL_BACK_FILE: + p = loopcxt_get_backing_file(lc); + break; + case COL_OFFSET: + if (loopcxt_get_offset(lc, &x) == 0) + xasprintf(&np, "%jd", x); + break; + case COL_SIZELIMIT: + if (loopcxt_get_sizelimit(lc, &x) == 0) + xasprintf(&np, "%jd", x); + break; + case COL_BACK_MAJMIN: + { + dev_t dev = 0; + if (loopcxt_get_backing_devno(lc, &dev) == 0 && dev) + xasprintf(&np, "%8u:%-3u", major(dev), minor(dev)); + break; + } + case COL_MAJMIN: + { + struct stat st; + + if (loopcxt_get_device(lc) + && stat(loopcxt_get_device(lc), &st) == 0 + && S_ISBLK(st.st_mode) + && major(st.st_rdev) == LOOPDEV_MAJOR) + xasprintf(&np, "%3u:%-3u", major(st.st_rdev), + minor(st.st_rdev)); + break; + } + case COL_BACK_INO: + { + ino_t ino = 0; + if (loopcxt_get_backing_inode(lc, &ino) == 0 && ino) + xasprintf(&np, "%ju", ino); + break; + } + case COL_AUTOCLR: + p = loopcxt_is_autoclear(lc) ? "1" : "0"; + break; + case COL_RO: + p = loopcxt_is_readonly(lc) ? "1" : "0"; + break; + case COL_DIO: + p = loopcxt_is_dio(lc) ? "1" : "0"; + break; + case COL_PARTSCAN: + p = loopcxt_is_partscan(lc) ? "1" : "0"; + break; + case COL_LOGSEC: + if (loopcxt_get_blocksize(lc, &x) == 0) + xasprintf(&np, "%jd", x); + break; + default: + return -EINVAL; + } + + + if (p) + rc = scols_line_set_data(ln, i, p); /* calls strdup() */ + else if (np) + rc = scols_line_refer_data(ln, i, np); /* only refers */ + + if (rc) + err(EXIT_FAILURE, _("failed to add output data")); + } + + return 0; +} + +static int show_table(struct loopdev_cxt *lc, + const char *file, + uint64_t offset, + int flags) +{ + struct stat sbuf, *st = &sbuf; + struct libscols_table *tb; + struct libscols_line *ln; + int rc = 0; + size_t i; + + scols_init_debug(0); + + if (!(tb = scols_new_table())) + err(EXIT_FAILURE, _("failed to allocate output table")); + scols_table_enable_raw(tb, raw); + scols_table_enable_json(tb, json); + scols_table_enable_noheadings(tb, no_headings); + + if (json) + scols_table_set_name(tb, "loopdevices"); + + for (i = 0; i < ncolumns; i++) { + struct colinfo *ci = get_column_info(i); + struct libscols_column *cl; + + cl = scols_table_new_column(tb, ci->name, ci->whint, ci->flags); + if (!cl) + err(EXIT_FAILURE, _("failed to allocate output column")); + if (json) + scols_column_set_json_type(cl, ci->json_type); + } + + /* only one loopdev requested (already assigned to loopdev_cxt) */ + if (loopcxt_get_device(lc)) { + ln = scols_table_new_line(tb, NULL); + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + rc = set_scols_data(lc, ln); + + /* list all loopdevs */ + } else { + char *cn_file = NULL; + + rc = loopcxt_init_iterator(lc, LOOPITER_FL_USED); + if (rc) + goto done; + if (!file || stat(file, st)) + st = NULL; + + while (loopcxt_next(lc) == 0) { + if (file) { + int used; + const char *bf = cn_file ? cn_file : file; + + used = loopcxt_is_used(lc, st, bf, offset, 0, flags); + if (!used && !cn_file) { + bf = cn_file = canonicalize_path(file); + used = loopcxt_is_used(lc, st, bf, offset, 0, flags); + } + if (!used) + continue; + } + + ln = scols_table_new_line(tb, NULL); + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + rc = set_scols_data(lc, ln); + if (rc) + break; + } + + loopcxt_deinit_iterator(lc); + free(cn_file); + } +done: + if (rc == 0) + rc = scols_print_table(tb); + scols_unref_table(tb); + return rc; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + + fprintf(out, + _(" %1$s [options] [<loopdev>]\n" + " %1$s [options] -f | <loopdev> <file>\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Set up and control loop devices.\n"), out); + + /* commands */ + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --all list all used devices\n"), out); + fputs(_(" -d, --detach <loopdev>... detach one or more devices\n"), out); + fputs(_(" -D, --detach-all detach all used devices\n"), out); + fputs(_(" -f, --find find first unused device\n"), out); + fputs(_(" -c, --set-capacity <loopdev> resize the device\n"), out); + fputs(_(" -j, --associated <file> list all devices associated with <file>\n"), out); + fputs(_(" -L, --nooverlap avoid possible conflict between devices\n"), out); + + /* commands options */ + fputs(USAGE_SEPARATOR, out); + fputs(_(" -o, --offset <num> start at offset <num> into file\n"), out); + fputs(_(" --sizelimit <num> device is limited to <num> bytes of the file\n"), out); + fputs(_(" -b --sector-size <num> set the logical sector size to <num>\n"), out); + fputs(_(" -P, --partscan create a partitioned loop device\n"), out); + fputs(_(" -r, --read-only set up a read-only loop device\n"), out); + fputs(_(" --direct-io[=<on|off>] open backing file with O_DIRECT\n"), out); + fputs(_(" --show print device name after setup (with -f)\n"), out); + fputs(_(" -v, --verbose verbose mode\n"), out); + + /* output options */ + fputs(USAGE_SEPARATOR, out); + fputs(_(" -J, --json use JSON --list output format\n"), out); + fputs(_(" -l, --list list info about all or specified (default)\n"), out); + fputs(_(" -n, --noheadings don't print headings for --list output\n"), out); + fputs(_(" -O, --output <cols> specify columns to output for --list\n"), out); + fputs(_(" --output-all output all columns\n"), out); + fputs(_(" --raw use raw --list output format\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(31)); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(infos); i++) + fprintf(out, " %12s %s\n", infos[i].name, _(infos[i].help)); + + printf(USAGE_MAN_TAIL("losetup(8)")); + + exit(EXIT_SUCCESS); +} + +static void warn_size(const char *filename, uint64_t size) +{ + struct stat st; + + if (!size) { + if (stat(filename, &st) || S_ISBLK(st.st_mode)) + return; + size = st.st_size; + } + + if (size < 512) + warnx(_("%s: Warning: file is smaller than 512 bytes; the loop device " + "may be useless or invisible for system tools."), + filename); + else if (size % 512) + warnx(_("%s: Warning: file does not fit into a 512-byte sector; " + "the end of the file will be ignored."), + filename); +} + +static int create_loop(struct loopdev_cxt *lc, + int nooverlap, int lo_flags, int flags, + const char *file, uint64_t offset, uint64_t sizelimit) +{ + int hasdev = loopcxt_has_device(lc); + int rc = 0; + + /* losetup --find --noverlap file.img */ + if (!hasdev && nooverlap) { + rc = loopcxt_find_overlap(lc, file, offset, sizelimit); + switch (rc) { + case 0: /* not found */ + break; + + case 1: /* overlap */ + loopcxt_deinit(lc); + errx(EXIT_FAILURE, _("%s: overlapping loop device exists"), file); + + case 2: /* overlap -- full size and offset match (reuse) */ + { + uint32_t lc_encrypt_type; + + /* Once a loop is initialized RO, there is no + * way to change its parameters. */ + if (loopcxt_is_readonly(lc) + && !(lo_flags & LO_FLAGS_READ_ONLY)) { + loopcxt_deinit(lc); + errx(EXIT_FAILURE, _("%s: overlapping read-only loop device exists"), file); + } + + /* This is no more supported, but check to be safe. */ + if (loopcxt_get_encrypt_type(lc, &lc_encrypt_type) == 0 + && lc_encrypt_type != LO_CRYPT_NONE) { + loopcxt_deinit(lc); + errx(EXIT_FAILURE, _("%s: overlapping encrypted loop device exists"), file); + } + + lc->info.lo_flags &= ~LO_FLAGS_AUTOCLEAR; + if (loopcxt_set_status(lc)) { + loopcxt_deinit(lc); + errx(EXIT_FAILURE, _("%s: failed to re-use loop device"), file); + } + return 0; /* success, re-use */ + } + default: /* error */ + loopcxt_deinit(lc); + errx(EXIT_FAILURE, _("failed to inspect loop devices")); + return -errno; + } + } + + if (hasdev && !is_loopdev(loopcxt_get_device(lc))) + loopcxt_add_device(lc); + + /* losetup --noverlap /dev/loopN file.img */ + if (hasdev && nooverlap) { + struct loopdev_cxt lc2; + + if (loopcxt_init(&lc2, 0)) { + loopcxt_deinit(lc); + err(EXIT_FAILURE, _("failed to initialize loopcxt")); + } + rc = loopcxt_find_overlap(&lc2, file, offset, sizelimit); + loopcxt_deinit(&lc2); + + if (rc) { + loopcxt_deinit(lc); + if (rc > 0) + errx(EXIT_FAILURE, _("%s: overlapping loop device exists"), file); + err(EXIT_FAILURE, _("%s: failed to check for conflicting loop devices"), file); + } + } + + /* Create a new device */ + do { + const char *errpre; + + /* Note that loopcxt_{find_unused,set_device}() resets + * loopcxt struct. + */ + if (!hasdev && (rc = loopcxt_find_unused(lc))) { + warnx(_("cannot find an unused loop device")); + break; + } + if (flags & LOOPDEV_FL_OFFSET) + loopcxt_set_offset(lc, offset); + if (flags & LOOPDEV_FL_SIZELIMIT) + loopcxt_set_sizelimit(lc, sizelimit); + if (lo_flags) + loopcxt_set_flags(lc, lo_flags); + if ((rc = loopcxt_set_backing_file(lc, file))) { + warn(_("%s: failed to use backing file"), file); + break; + } + errno = 0; + rc = loopcxt_setup_device(lc); + if (rc == 0) + break; /* success */ + if (errno == EBUSY && !hasdev) + continue; + + /* errors */ + errpre = hasdev && loopcxt_get_fd(lc) < 0 ? + loopcxt_get_device(lc) : file; + warn(_("%s: failed to set up loop device"), errpre); + break; + } while (hasdev == 0); + + return rc; +} + +int main(int argc, char **argv) +{ + struct loopdev_cxt lc; + int act = 0, flags = 0, no_overlap = 0, c; + char *file = NULL; + uint64_t offset = 0, sizelimit = 0, blocksize = 0; + int res = 0, showdev = 0, lo_flags = 0; + char *outarg = NULL; + int list = 0; + unsigned long use_dio = 0, set_dio = 0, set_blocksize = 0; + + enum { + OPT_SIZELIMIT = CHAR_MAX + 1, + OPT_SHOW, + OPT_RAW, + OPT_DIO, + OPT_OUTPUT_ALL + }; + static const struct option longopts[] = { + { "all", no_argument, NULL, 'a' }, + { "set-capacity", required_argument, NULL, 'c' }, + { "detach", required_argument, NULL, 'd' }, + { "detach-all", no_argument, NULL, 'D' }, + { "find", no_argument, NULL, 'f' }, + { "nooverlap", no_argument, NULL, 'L' }, + { "help", no_argument, NULL, 'h' }, + { "associated", required_argument, NULL, 'j' }, + { "json", no_argument, NULL, 'J' }, + { "list", no_argument, NULL, 'l' }, + { "sector-size", required_argument, NULL, 'b' }, + { "noheadings", no_argument, NULL, 'n' }, + { "offset", required_argument, NULL, 'o' }, + { "output", required_argument, NULL, 'O' }, + { "output-all", no_argument, NULL, OPT_OUTPUT_ALL }, + { "sizelimit", required_argument, NULL, OPT_SIZELIMIT }, + { "partscan", no_argument, NULL, 'P' }, + { "read-only", no_argument, NULL, 'r' }, + { "direct-io", optional_argument, NULL, OPT_DIO }, + { "raw", no_argument, NULL, OPT_RAW }, + { "show", no_argument, NULL, OPT_SHOW }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'D','a','c','d','f','j' }, + { 'D','c','d','f','l' }, + { 'D','c','d','f','O' }, + { 'J',OPT_RAW }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + if (loopcxt_init(&lc, 0)) + err(EXIT_FAILURE, _("failed to initialize loopcxt")); + + while ((c = getopt_long(argc, argv, "ab:c:d:Dfhj:JlLno:O:PrvV", + longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'a': + act = A_SHOW; + break; + case 'b': + set_blocksize = 1; + blocksize = strtosize_or_err(optarg, _("failed to parse logical block size")); + break; + case 'c': + act = A_SET_CAPACITY; + if (!is_loopdev(optarg) || + loopcxt_set_device(&lc, optarg)) + err(EXIT_FAILURE, _("%s: failed to use device"), + optarg); + break; + case 'r': + lo_flags |= LO_FLAGS_READ_ONLY; + break; + case 'd': + act = A_DELETE; + if (!is_loopdev(optarg) || + loopcxt_set_device(&lc, optarg)) + err(EXIT_FAILURE, _("%s: failed to use device"), + optarg); + break; + case 'D': + act = A_DELETE_ALL; + break; + case 'f': + act = A_FIND_FREE; + break; + case 'h': + usage(); + break; + case 'J': + json = 1; + break; + case 'j': + act = A_SHOW; + file = optarg; + break; + case 'l': + list = 1; + break; + case 'L': + no_overlap = 1; + break; + case 'n': + no_headings = 1; + break; + case OPT_RAW: + raw = 1; + break; + case 'o': + offset = strtosize_or_err(optarg, _("failed to parse offset")); + flags |= LOOPDEV_FL_OFFSET; + break; + case 'O': + outarg = optarg; + list = 1; + break; + case OPT_OUTPUT_ALL: + for (ncolumns = 0; ncolumns < ARRAY_SIZE(infos); ncolumns++) + columns[ncolumns] = ncolumns; + break; + case 'P': + lo_flags |= LO_FLAGS_PARTSCAN; + break; + case OPT_SHOW: + showdev = 1; + break; + case OPT_DIO: + use_dio = set_dio = 1; + if (optarg) + use_dio = parse_switch(optarg, _("argument error"), "on", "off", NULL); + break; + case 'v': + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case OPT_SIZELIMIT: /* --sizelimit */ + sizelimit = strtosize_or_err(optarg, _("failed to parse size")); + flags |= LOOPDEV_FL_SIZELIMIT; + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + ul_path_init_debug(); + ul_sysfs_init_debug(); + + /* default is --list --all */ + if (argc == 1) { + act = A_SHOW; + list = 1; + } + + if (!act && argc == 2 && (raw || json)) { + act = A_SHOW; + list = 1; + } + + /* default --list output columns */ + if (list && !ncolumns) { + columns[ncolumns++] = COL_NAME; + columns[ncolumns++] = COL_SIZELIMIT; + columns[ncolumns++] = COL_OFFSET; + columns[ncolumns++] = COL_AUTOCLR; + columns[ncolumns++] = COL_RO; + columns[ncolumns++] = COL_BACK_FILE; + columns[ncolumns++] = COL_DIO; + columns[ncolumns++] = COL_LOGSEC; + } + + if (act == A_FIND_FREE && optind < argc) { + /* + * losetup -f <backing_file> + */ + act = A_CREATE; + file = argv[optind++]; + + if (optind < argc) + errx(EXIT_FAILURE, _("unexpected arguments")); + } + + if (list && !act && optind == argc) + /* + * losetup --list defaults to --all + */ + act = A_SHOW; + + if (!act && optind + 1 == argc) { + /* + * losetup [--list] <device> + * OR + * losetup {--direct-io[=off]|--logical-blocksize=size}... <device> + */ + if (!(set_dio || set_blocksize)) + act = A_SHOW_ONE; + if (set_dio) + act = A_SET_DIRECT_IO; + if (set_blocksize) + act = A_SET_BLOCKSIZE; + if (!is_loopdev(argv[optind]) || + loopcxt_set_device(&lc, argv[optind])) + err(EXIT_FAILURE, _("%s: failed to use device"), + argv[optind]); + optind++; + } + if (!act) { + /* + * losetup <loopdev> <backing_file> + */ + act = A_CREATE; + + if (optind >= argc) + errx(EXIT_FAILURE, _("no loop device specified")); + /* don't use is_loopdev() here, the device does not have exist yet */ + if (loopcxt_set_device(&lc, argv[optind])) + err(EXIT_FAILURE, _("%s: failed to use device"), + argv[optind]); + optind++; + + if (optind >= argc) + errx(EXIT_FAILURE, _("no file specified")); + file = argv[optind++]; + } + + if (act != A_CREATE && + (sizelimit || lo_flags || showdev)) + errx(EXIT_FAILURE, + _("the options %s are allowed during loop device setup only"), + "--{sizelimit,read-only,show}"); + + if ((flags & LOOPDEV_FL_OFFSET) && + act != A_CREATE && (act != A_SHOW || !file)) + errx(EXIT_FAILURE, _("the option --offset is not allowed in this context")); + + if (outarg && string_add_to_idarray(outarg, columns, ARRAY_SIZE(columns), + &ncolumns, column_name_to_id) < 0) + return EXIT_FAILURE; + + switch (act) { + case A_CREATE: + res = create_loop(&lc, no_overlap, lo_flags, flags, file, offset, sizelimit); + if (res == 0) { + if (showdev) + printf("%s\n", loopcxt_get_device(&lc)); + warn_size(file, sizelimit); + if (set_dio || set_blocksize) + goto lo_set_post; + } + break; + case A_DELETE: + res = delete_loop(&lc); + while (optind < argc) { + if (!is_loopdev(argv[optind]) || + loopcxt_set_device(&lc, argv[optind])) + warn(_("%s: failed to use device"), + argv[optind]); + optind++; + res += delete_loop(&lc); + } + break; + case A_DELETE_ALL: + res = delete_all_loops(&lc); + break; + case A_FIND_FREE: + res = loopcxt_find_unused(&lc); + if (res) { + int errsv = errno; + + if (access(_PATH_DEV_LOOPCTL, F_OK) == 0 && + access(_PATH_DEV_LOOPCTL, W_OK) != 0) + ; + else + errno = errsv; + + warn(_("cannot find an unused loop device")); + } else + printf("%s\n", loopcxt_get_device(&lc)); + break; + case A_SHOW: + if (list) + res = show_table(&lc, file, offset, flags); + else + res = show_all_loops(&lc, file, offset, flags); + break; + case A_SHOW_ONE: + if (list) + res = show_table(&lc, NULL, 0, 0); + else + res = printf_loopdev(&lc); + if (res) + warn("%s", loopcxt_get_device(&lc)); + break; + case A_SET_CAPACITY: + res = loopcxt_set_capacity(&lc); + if (res) + warn(_("%s: set capacity failed"), + loopcxt_get_device(&lc)); + break; + case A_SET_DIRECT_IO: + case A_SET_BLOCKSIZE: + lo_set_post: + if (set_dio) { + res = loopcxt_set_dio(&lc, use_dio); + if (res) + warn(_("%s: set direct io failed"), + loopcxt_get_device(&lc)); + } + if (set_blocksize) { + res = loopcxt_set_blocksize(&lc, blocksize); + if (res) + warn(_("%s: set logical block size failed"), + loopcxt_get_device(&lc)); + } + break; + default: + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + break; + } + + loopcxt_deinit(&lc); + return res ? EXIT_FAILURE : EXIT_SUCCESS; +} + diff --git a/sys-utils/lscpu-arm.c b/sys-utils/lscpu-arm.c new file mode 100644 index 0000000..37b8f66 --- /dev/null +++ b/sys-utils/lscpu-arm.c @@ -0,0 +1,252 @@ +/* + * lscpu-arm.c - ARM CPU identification tables + * + * Copyright (C) 2018 Riku Voipio <riku.voipio@iki.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * The information here is gathered from + * - ARM manuals + * - Linux kernel: arch/armX/include/asm/cputype.h + * - GCC sources: config/arch/arch-cores.def + * - Ancient wisdom + */ +#include "lscpu.h" + +struct id_part { + const int id; + const char* name; +}; + +static const struct id_part arm_part[] = { + { 0x810, "ARM810" }, + { 0x920, "ARM920" }, + { 0x922, "ARM922" }, + { 0x926, "ARM926" }, + { 0x940, "ARM940" }, + { 0x946, "ARM946" }, + { 0x966, "ARM966" }, + { 0xa20, "ARM1020" }, + { 0xa22, "ARM1022" }, + { 0xa26, "ARM1026" }, + { 0xb02, "ARM11 MPCore" }, + { 0xb36, "ARM1136" }, + { 0xb56, "ARM1156" }, + { 0xb76, "ARM1176" }, + { 0xc05, "Cortex-A5" }, + { 0xc07, "Cortex-A7" }, + { 0xc08, "Cortex-A8" }, + { 0xc09, "Cortex-A9" }, + { 0xc0d, "Cortex-A17" }, /* Originally A12 */ + { 0xc0f, "Cortex-A15" }, + { 0xc0e, "Cortex-A17" }, + { 0xc14, "Cortex-R4" }, + { 0xc15, "Cortex-R5" }, + { 0xc17, "Cortex-R7" }, + { 0xc18, "Cortex-R8" }, + { 0xc20, "Cortex-M0" }, + { 0xc21, "Cortex-M1" }, + { 0xc23, "Cortex-M3" }, + { 0xc24, "Cortex-M4" }, + { 0xc27, "Cortex-M7" }, + { 0xc60, "Cortex-M0+" }, + { 0xd01, "Cortex-A32" }, + { 0xd03, "Cortex-A53" }, + { 0xd04, "Cortex-A35" }, + { 0xd05, "Cortex-A55" }, + { 0xd07, "Cortex-A57" }, + { 0xd08, "Cortex-A72" }, + { 0xd09, "Cortex-A73" }, + { 0xd0a, "Cortex-A75" }, + { 0xd13, "Cortex-R52" }, + { 0xd20, "Cortex-M23" }, + { 0xd21, "Cortex-M33" }, + { -1, "unknown" }, +}; + +static const struct id_part brcm_part[] = { + { 0x0f, "Brahma B15" }, + { 0x100, "Brahma B53" }, + { 0x516, "ThunderX2" }, + { -1, "unknown" }, +}; + +static const struct id_part dec_part[] = { + { 0xa10, "SA110" }, + { 0xa11, "SA1100" }, + { -1, "unknown" }, +}; + +static const struct id_part cavium_part[] = { + { 0x0a0, "ThunderX" }, + { 0x0a1, "ThunderX 88XX" }, + { 0x0a2, "ThunderX 81XX" }, + { 0x0a3, "ThunderX 83XX" }, + { 0x0af, "ThunderX2 99xx" }, + { -1, "unknown" }, +}; + +static const struct id_part apm_part[] = { + { 0x000, "X-Gene" }, + { -1, "unknown" }, +}; + +static const struct id_part qcom_part[] = { + { 0x00f, "Scorpion" }, + { 0x02d, "Scorpion" }, + { 0x04d, "Krait" }, + { 0x06f, "Krait" }, + { 0x201, "Kryo" }, + { 0x205, "Kryo" }, + { 0x211, "Kryo" }, + { 0x800, "Falkor V1/Kryo" }, + { 0x801, "Kryo V2" }, + { 0xc00, "Falkor" }, + { 0xc01, "Saphira" }, + { -1, "unknown" }, +}; + +static const struct id_part samsung_part[] = { + { 0x001, "exynos-m1" }, + { -1, "unknown" }, +}; + +static const struct id_part nvidia_part[] = { + { 0x000, "Denver" }, + { 0x003, "Denver 2" }, + { -1, "unknown" }, +}; + +static const struct id_part marvell_part[] = { + { 0x131, "Feroceon 88FR131" }, + { 0x581, "PJ4/PJ4b" }, + { 0x584, "PJ4B-MP" }, + { -1, "unknown" }, +}; + +static const struct id_part faraday_part[] = { + { 0x526, "FA526" }, + { 0x626, "FA626" }, + { -1, "unknown" }, +}; + +static const struct id_part intel_part[] = { + { 0x200, "i80200" }, + { 0x210, "PXA250A" }, + { 0x212, "PXA210A" }, + { 0x242, "i80321-400" }, + { 0x243, "i80321-600" }, + { 0x290, "PXA250B/PXA26x" }, + { 0x292, "PXA210B" }, + { 0x2c2, "i80321-400-B0" }, + { 0x2c3, "i80321-600-B0" }, + { 0x2d0, "PXA250C/PXA255/PXA26x" }, + { 0x2d2, "PXA210C" }, + { 0x411, "PXA27x" }, + { 0x41c, "IPX425-533" }, + { 0x41d, "IPX425-400" }, + { 0x41f, "IPX425-266" }, + { 0x682, "PXA32x" }, + { 0x683, "PXA930/PXA935" }, + { 0x688, "PXA30x" }, + { 0x689, "PXA31x" }, + { 0xb11, "SA1110" }, + { 0xc12, "IPX1200" }, + { -1, "unknown" }, +}; + +static const struct id_part unknown_part[] = { + { -1, "unknown" }, +}; + +struct hw_impl { + const int id; + const struct id_part *parts; + const char *name; +}; + +static const struct hw_impl hw_implementer[] = { + { 0x41, arm_part, "ARM" }, + { 0x42, brcm_part, "Broadcom" }, + { 0x43, cavium_part, "Cavium" }, + { 0x44, dec_part, "DEC" }, + { 0x4e, nvidia_part, "Nvidia" }, + { 0x50, apm_part, "APM" }, + { 0x51, qcom_part, "Qualcomm" }, + { 0x53, samsung_part, "Samsung" }, + { 0x56, marvell_part, "Marvell" }, + { 0x66, faraday_part, "Faraday" }, + { 0x69, intel_part, "Intel" }, + { -1, unknown_part, "unknown" }, +}; + +void arm_cpu_decode(struct lscpu_desc *desc) +{ + int j, impl, part; + const struct id_part *parts = NULL; + char *end; + + if (desc->vendor == NULL || desc->model == NULL) + return; + if ((strncmp(desc->vendor,"0x",2) || strncmp(desc->model,"0x",2) )) + return; + + errno = 0; + impl = (int) strtol(desc->vendor, &end, 0); + if (errno || desc->vendor == end) + return; + + errno = 0; + part = (int) strtol(desc->model, &end, 0); + if (errno || desc->model == end) + return; + + for (j = 0; hw_implementer[j].id != -1; j++) { + if (hw_implementer[j].id == impl) { + parts = hw_implementer[j].parts; + desc->vendor = (char *) hw_implementer[j].name; + break; + } + } + + if (parts == NULL) + return; + + for (j = 0; parts[j].id != -1; j++) { + if (parts[j].id == part) { + desc->modelname = (char *) parts[j].name; + break; + } + } + + /* Print out the rXpY string for ARM cores */ + if (impl == 0x41 && desc->revision && desc->stepping) { + int revision, variant; + char buf[8]; + + errno = 0; + revision = (int) strtol(desc->revision, &end, 10); + if (errno || desc->revision == end) + return; + + errno = 0; + variant = (int) strtol(desc->stepping, &end, 0); + if (errno || desc->stepping == end) + return; + + snprintf(buf, sizeof(buf), "r%dp%d", variant, revision); + desc->stepping = xstrdup(buf); + } +} diff --git a/sys-utils/lscpu-dmi.c b/sys-utils/lscpu-dmi.c new file mode 100644 index 0000000..29bd2e4 --- /dev/null +++ b/sys-utils/lscpu-dmi.c @@ -0,0 +1,305 @@ +/* + * lscpu-dmi - Module to parse SMBIOS information + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Code originally taken from the dmidecode utility and slightly rewritten + * to suite the needs of lscpu + */ +#include <errno.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> +#include <stdio.h> + +#include "lscpu.h" + +#define _PATH_SYS_DMI "/sys/firmware/dmi/tables/DMI" + +#define WORD(x) (uint16_t)(*(const uint16_t *)(x)) +#define DWORD(x) (uint32_t)(*(const uint32_t *)(x)) + +struct dmi_header +{ + uint8_t type; + uint8_t length; + uint16_t handle; + uint8_t *data; +}; + +static int checksum(const uint8_t *buf, size_t len) +{ + uint8_t sum = 0; + size_t a; + + for (a = 0; a < len; a++) + sum += buf[a]; + return (sum == 0); +} + +static void *get_mem_chunk(size_t base, size_t len, const char *devmem) +{ + void *p = NULL; + int fd; + + if ((fd = open(devmem, O_RDONLY)) < 0) + return NULL; + + if (!(p = malloc(len))) + goto nothing; + if (lseek(fd, base, SEEK_SET) == -1) + goto nothing; + if (read_all(fd, p, len) == -1) + goto nothing; + + close(fd); + return p; + +nothing: + free(p); + close(fd); + return NULL; +} + +static void to_dmi_header(struct dmi_header *h, uint8_t *data) +{ + h->type = data[0]; + h->length = data[1]; + h->handle = WORD(data + 2); + h->data = data; +} + +static char *dmi_string(const struct dmi_header *dm, uint8_t s) +{ + char *bp = (char *)dm->data; + + if (s == 0) + return NULL; + + bp += dm->length; + while (s > 1 && *bp) + { + bp += strlen(bp); + bp++; + s--; + } + + if (!*bp) + return NULL; + + return bp; +} + +static int hypervisor_from_dmi_table(uint32_t base, uint16_t len, + uint16_t num, const char *devmem) +{ + uint8_t *buf; + uint8_t *data; + int i = 0; + char *vendor = NULL; + char *product = NULL; + char *manufacturer = NULL; + int rc = HYPER_NONE; + + data = buf = get_mem_chunk(base, len, devmem); + if (!buf) + goto done; + + /* 4 is the length of an SMBIOS structure header */ + while (i < num && data + 4 <= buf + len) { + uint8_t *next; + struct dmi_header h; + + to_dmi_header(&h, data); + + /* + * If a short entry is found (less than 4 bytes), not only it + * is invalid, but we cannot reliably locate the next entry. + * Better stop at this point. + */ + if (h.length < 4) + goto done; + + /* look for the next handle */ + next = data + h.length; + while (next - buf + 1 < len && (next[0] != 0 || next[1] != 0)) + next++; + next += 2; + switch (h.type) { + case 0: + vendor = dmi_string(&h, data[0x04]); + break; + case 1: + manufacturer = dmi_string(&h, data[0x04]); + product = dmi_string(&h, data[0x05]); + break; + default: + break; + } + + data = next; + i++; + } + if (manufacturer && !strcmp(manufacturer, "innotek GmbH")) + rc = HYPER_INNOTEK; + else if (manufacturer && strstr(manufacturer, "HITACHI") && + product && strstr(product, "LPAR")) + rc = HYPER_HITACHI; + else if (vendor && !strcmp(vendor, "Parallels")) + rc = HYPER_PARALLELS; +done: + free(buf); + return rc; +} + +#if defined(__x86_64__) || defined(__i386__) +static int hypervisor_decode_legacy(uint8_t *buf, const char *devmem) +{ + if (!checksum(buf, 0x0F)) + return -1; + + return hypervisor_from_dmi_table(DWORD(buf + 0x08), WORD(buf + 0x06), + WORD(buf + 0x0C), + devmem); +} +#endif + +static int hypervisor_decode_smbios(uint8_t *buf, const char *devmem) +{ + if (!checksum(buf, buf[0x05]) + || memcmp(buf + 0x10, "_DMI_", 5) != 0 + || !checksum(buf + 0x10, 0x0F)) + return -1; + + return hypervisor_from_dmi_table(DWORD(buf + 0x18), WORD(buf + 0x16), + WORD(buf + 0x1C), + devmem); +} + +static int hypervisor_decode_sysfw(void) +{ + static char const sys_fw_dmi_tables[] = _PATH_SYS_DMI; + struct stat st; + + if (stat(sys_fw_dmi_tables, &st)) + return -1; + + return hypervisor_from_dmi_table(0, st.st_size, st.st_size / 4, + sys_fw_dmi_tables); +} + +/* + * Probe for EFI interface + */ +#define EFI_NOT_FOUND (-1) +#define EFI_NO_SMBIOS (-2) +static int address_from_efi(size_t *address) +{ + FILE *tab; + char linebuf[64]; + int ret; + + *address = 0; /* Prevent compiler warning */ + + /* + * Linux up to 2.6.6: /proc/efi/systab + * Linux 2.6.7 and up: /sys/firmware/efi/systab + */ + if (!(tab = fopen("/sys/firmware/efi/systab", "r")) && + !(tab = fopen("/proc/efi/systab", "r"))) + return EFI_NOT_FOUND; /* No EFI interface */ + + ret = EFI_NO_SMBIOS; + while ((fgets(linebuf, sizeof(linebuf) - 1, tab)) != NULL) { + char *addrp = strchr(linebuf, '='); + if (!addrp) + continue; + *(addrp++) = '\0'; + if (strcmp(linebuf, "SMBIOS") == 0) { + *address = strtoul(addrp, NULL, 0); + ret = 0; + break; + } + } + + fclose(tab); + return ret; +} + +int read_hypervisor_dmi(void) +{ + int rc = HYPER_NONE; + uint8_t *buf = NULL; + size_t fp = 0; + + if (sizeof(uint8_t) != 1 + || sizeof(uint16_t) != 2 + || sizeof(uint32_t) != 4 + || '\0' != 0) + goto done; + + /* -1 : no DMI in /sys, + * 0 : DMI exist, nothing detected (HYPER_NONE) + * >0 : hypervisor detected + */ + rc = hypervisor_decode_sysfw(); + if (rc >= HYPER_NONE) + goto done; + + /* First try EFI (ia64, Intel-based Mac) */ + switch (address_from_efi(&fp)) { + case EFI_NOT_FOUND: + goto memory_scan; + case EFI_NO_SMBIOS: + goto done; + } + + buf = get_mem_chunk(fp, 0x20, _PATH_DEV_MEM); + if (!buf) + goto done; + + rc = hypervisor_decode_smbios(buf, _PATH_DEV_MEM); + if (rc >= HYPER_NONE) + goto done; + + free(buf); + buf = NULL; +memory_scan: +#if defined(__x86_64__) || defined(__i386__) + /* Fallback to memory scan (x86, x86_64) */ + buf = get_mem_chunk(0xF0000, 0x10000, _PATH_DEV_MEM); + if (!buf) + goto done; + + for (fp = 0; fp <= 0xFFF0; fp += 16) { + if (memcmp(buf + fp, "_SM_", 4) == 0 && fp <= 0xFFE0) { + rc = hypervisor_decode_smbios(buf + fp, _PATH_DEV_MEM); + if (rc < 0) + fp += 16; + + } else if (memcmp(buf + fp, "_DMI_", 5) == 0) + rc = hypervisor_decode_legacy(buf + fp, _PATH_DEV_MEM); + + if (rc >= HYPER_NONE) + break; + } +#endif +done: + free(buf); + return rc < 0 ? HYPER_NONE : rc; +} diff --git a/sys-utils/lscpu.1 b/sys-utils/lscpu.1 new file mode 100644 index 0000000..23dee9b --- /dev/null +++ b/sys-utils/lscpu.1 @@ -0,0 +1,184 @@ +.TH LSCPU 1 "November 2015" "util-linux" "User Commands" +.SH NAME +lscpu \- display information about the CPU architecture +.SH SYNOPSIS +.B lscpu +.RB [ \-a | \-b | \-c | \-J "] [" \-x "] [" \-y "] [" \-s " \fIdirectory\fP] [" \-e [=\fIlist\fP]| \-p [=\fIlist\fP]] +.br +.B lscpu +.BR \-h | \-V +.SH DESCRIPTION +.B lscpu +gathers CPU architecture information from sysfs, /proc/cpuinfo and any +applicable architecture-specific libraries (e.g.\& librtas on Powerpc). The +command output can be optimized for parsing or for easy readability by humans. +The information includes, for example, the number of CPUs, threads, cores, +sockets, and Non-Uniform Memory Access (NUMA) nodes. There is also information +about the CPU caches and cache sharing, family, model, bogoMIPS, byte order, +and stepping. +.sp +In virtualized environments, the CPU architecture information displayed +reflects the configuration of the guest operating system which is +typically different from the physical (host) system. On architectures that +support retrieving physical topology information, +.B lscpu +also displays the number of physical sockets, chips, cores in the host system. +.sp +Options that result in an output table have a \fIlist\fP argument. Use this +argument to customize the command output. Specify a comma-separated list of +column labels to limit the output table to only the specified columns, arranged +in the specified order. See \fBCOLUMNS\fP for a list of valid column labels. The +column labels are not case sensitive. +.sp +Not all columns are supported on all architectures. If an unsupported column is +specified, \fBlscpu\fP prints the column but does not provide any data for it. + +.SS COLUMNS +Note that topology elements (core, socket, etc.) use a sequential unique ID +starting from zero, but CPU logical numbers follow the kernel where there is +no guarantee of sequential numbering. +.TP +.B CPU +The logical CPU number of a CPU as used by the Linux kernel. +.TP +.B CORE +The logical core number. A core can contain several CPUs. +.TP +.B SOCKET +The logical socket number. A socket can contain several cores. +.TP +.B BOOK +The logical book number. A book can contain several sockets. +.TP +.B DRAWER +The logical drawer number. A drawer can contain several books. +.TP +.B NODE +The logical NUMA node number. A node can contain several drawers. +.TP +.B CACHE +Information about how caches are shared between CPUs. +.TP +.B ADDRESS +The physical address of a CPU. +.TP +.B ONLINE +Indicator that shows whether the Linux instance currently makes use of the CPU. +.TP +.B CONFIGURED +Indicator that shows if the hypervisor has allocated the CPU to the virtual +hardware on which the Linux instance runs. CPUs that are configured can be set +online by the Linux instance. +This column contains data only if your hardware system and hypervisor support +dynamic CPU resource allocation. +.TP +.B POLARIZATION +This column contains data for Linux instances that run on virtual hardware with +a hypervisor that can switch the CPU dispatching mode (polarization). The +polarization can be: +.RS +.TP 12 +.B horizontal\fP +The workload is spread across all available CPUs. +.TP 12 +.B vertical +The workload is concentrated on few CPUs. +.P +For vertical polarization, the column also shows the degree of concentration, +high, medium, or low. This column contains data only if your hardware system +and hypervisor support CPU polarization. +.RE +.TP +.B MAXMHZ +Maximum megahertz value for the CPU. Useful when \fBlscpu\fP is used as hardware +inventory information gathering tool. Notice that the megahertz value is +dynamic, and driven by CPU governor depending on current resource need. +.TP +.B MINMHZ +Minimum megahertz value for the CPU. +.SH OPTIONS +.TP +.BR \-a , " \-\-all" +Include lines for online and offline CPUs in the output (default for \fB-e\fR). +This option may only be specified together with option \fB-e\fR or \fB-p\fR. +.TP +.BR \-b , " \-\-online" +Limit the output to online CPUs (default for \fB-p\fR). +This option may only be specified together with option \fB-e\fR or \fB-p\fR. +.TP +.BR \-c , " \-\-offline" +Limit the output to offline CPUs. +This option may only be specified together with option \fB-e\fR or \fB-p\fR. +.TP +.BR \-e , " \-\-extended" [=\fIlist\fP] +Display the CPU information in human-readable format. + +If the \fIlist\fP argument is omitted, all columns for which data is available +are included in the command output. + +When specifying the \fIlist\fP argument, the string of option, equal sign (=), and +\fIlist\fP must not contain any blanks or other whitespace. +Examples: '\fB-e=cpu,node\fP' or '\fB--extended=cpu,node\fP'. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.TP +.BR \-J , " \-\-json" +Use JSON output format for the default summary or extended output (see \fB\-\-extended\fP). +.TP +.BR \-p , " \-\-parse" [=\fIlist\fP] +Optimize the command output for easy parsing. + +If the \fIlist\fP argument is omitted, the command output is compatible with earlier +versions of \fBlscpu\fP. In this compatible format, two commas are used to separate +CPU cache columns. If no CPU caches are identified the cache column is omitted. +.br +If the \fIlist\fP argument is used, cache columns are separated with a colon (:). + +When specifying the \fIlist\fP argument, the string of option, equal sign (=), and +\fIlist\fP must not contain any blanks or other whitespace. +Examples: '\fB-p=cpu,node\fP' or '\fB--parse=cpu,node\fP'. +.TP +.BR \-s , " \-\-sysroot " \fIdirectory\fP +Gather CPU data for a Linux instance other than the instance from which the +\fBlscpu\fP command is issued. The specified \fIdirectory\fP is the system root +of the Linux instance to be inspected. +.TP +.BR \-x , " \-\-hex" +Use hexadecimal masks for CPU sets (for example 0x3). The default is to print +the sets in list format (for example 0,1). +.TP +.BR \-y , " \-\-physical" +Display physical IDs for all columns with topology elements (core, socket, etc.). +Other than logical IDs, which are assigned by \fBlscpu\fP, physical IDs are +platform-specific values that are provided by the kernel. Physical IDs are not +necessarily unique and they might not be arranged sequentially. +If the kernel could not retrieve a physical ID for an element \fBlscpu\fP prints +the dash (-) character. + +The CPU logical numbers are not affected by this option. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.B \-\-output\-all +Output all available columns. This option must be combined with either +.BR \-\-extended " or " \-\-parse . +.SH BUGS +The basic overview of CPU family, model, etc. is always based on the first +CPU only. + +Sometimes in Xen Dom0 the kernel reports wrong data. + +On virtual hardware the number of cores per socket, etc. can be wrong. +.SH AUTHOR +.nf +Cai Qian <qcai@redhat.com> +Karel Zak <kzak@redhat.com> +Heiko Carstens <heiko.carstens@de.ibm.com> +.fi +.SH "SEE ALSO" +.BR chcpu (8) +.SH AVAILABILITY +The lscpu command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/lscpu.c b/sys-utils/lscpu.c new file mode 100644 index 0000000..1ff9069 --- /dev/null +++ b/sys-utils/lscpu.c @@ -0,0 +1,2134 @@ +/* + * lscpu - CPU architecture information helper + * + * Copyright (C) 2008 Cai Qian <qcai@redhat.com> + * Copyright (C) 2008 Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <assert.h> +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/utsname.h> +#include <unistd.h> +#include <stdarg.h> +#include <sys/types.h> +#include <sys/stat.h> + +#if (defined(__x86_64__) || defined(__i386__)) +# if !defined( __SANITIZE_ADDRESS__) +# define INCLUDE_VMWARE_BDOOR +# else +# warning VMWARE detection disabled by __SANITIZE_ADDRESS__ +# endif +#endif + +#ifdef INCLUDE_VMWARE_BDOOR +# include <stdint.h> +# include <signal.h> +# include <strings.h> +# include <setjmp.h> +# ifdef HAVE_SYS_IO_H +# include <sys/io.h> +# endif +#endif + +#if defined(HAVE_LIBRTAS) +#include <librtas.h> +#endif + +#include <libsmartcols.h> + +#include "closestream.h" +#include "optutils.h" + +#include "lscpu.h" + +#define CACHE_MAX 100 + +/* /sys paths */ +#define _PATH_SYS_SYSTEM "/sys/devices/system" +#define _PATH_SYS_HYP_FEATURES "/sys/hypervisor/properties/features" +#define _PATH_SYS_CPU _PATH_SYS_SYSTEM "/cpu" +#define _PATH_SYS_NODE _PATH_SYS_SYSTEM "/node" + +/* Xen Domain feature flag used for /sys/hypervisor/properties/features */ +#define XENFEAT_supervisor_mode_kernel 3 +#define XENFEAT_mmu_pt_update_preserve_ad 5 +#define XENFEAT_hvm_callback_vector 8 + +#define XEN_FEATURES_PV_MASK (1U << XENFEAT_mmu_pt_update_preserve_ad) +#define XEN_FEATURES_PVH_MASK ( (1U << XENFEAT_supervisor_mode_kernel) \ + | (1U << XENFEAT_hvm_callback_vector) ) + +static const char *virt_types[] = { + [VIRT_NONE] = N_("none"), + [VIRT_PARA] = N_("para"), + [VIRT_FULL] = N_("full"), + [VIRT_CONT] = N_("container"), +}; + +static const char *hv_vendors[] = { + [HYPER_NONE] = NULL, + [HYPER_XEN] = "Xen", + [HYPER_KVM] = "KVM", + [HYPER_MSHV] = "Microsoft", + [HYPER_VMWARE] = "VMware", + [HYPER_IBM] = "IBM", + [HYPER_VSERVER] = "Linux-VServer", + [HYPER_UML] = "User-mode Linux", + [HYPER_INNOTEK] = "Innotek GmbH", + [HYPER_HITACHI] = "Hitachi", + [HYPER_PARALLELS] = "Parallels", + [HYPER_VBOX] = "Oracle", + [HYPER_OS400] = "OS/400", + [HYPER_PHYP] = "pHyp", + [HYPER_SPAR] = "Unisys s-Par", + [HYPER_WSL] = "Windows Subsystem for Linux" +}; + +static const int hv_vendor_pci[] = { + [HYPER_NONE] = 0x0000, + [HYPER_XEN] = 0x5853, + [HYPER_KVM] = 0x0000, + [HYPER_MSHV] = 0x1414, + [HYPER_VMWARE] = 0x15ad, + [HYPER_VBOX] = 0x80ee, +}; + +static const int hv_graphics_pci[] = { + [HYPER_NONE] = 0x0000, + [HYPER_XEN] = 0x0001, + [HYPER_KVM] = 0x0000, + [HYPER_MSHV] = 0x5353, + [HYPER_VMWARE] = 0x0710, + [HYPER_VBOX] = 0xbeef, +}; + + +/* dispatching modes */ +static const char *disp_modes[] = { + [DISP_HORIZONTAL] = N_("horizontal"), + [DISP_VERTICAL] = N_("vertical") +}; + +static struct polarization_modes polar_modes[] = { + [POLAR_UNKNOWN] = {"U", "-"}, + [POLAR_VLOW] = {"VL", "vert-low"}, + [POLAR_VMEDIUM] = {"VM", "vert-medium"}, + [POLAR_VHIGH] = {"VH", "vert-high"}, + [POLAR_HORIZONTAL] = {"H", "horizontal"}, +}; + +static int maxcpus; /* size in bits of kernel cpu mask */ + +#define is_cpu_online(_d, _cpu) \ + ((_d) && (_d)->online ? \ + CPU_ISSET_S((_cpu), CPU_ALLOC_SIZE(maxcpus), (_d)->online) : 0) +#define is_cpu_present(_d, _cpu) \ + ((_d) && (_d)->present ? \ + CPU_ISSET_S((_cpu), CPU_ALLOC_SIZE(maxcpus), (_d)->present) : 0) + +#define real_cpu_num(_d, _i) ((_d)->idx2cpunum[(_i)]) + +/* + * IDs + */ +enum { + COL_CPU, + COL_CORE, + COL_SOCKET, + COL_NODE, + COL_BOOK, + COL_DRAWER, + COL_CACHE, + COL_POLARIZATION, + COL_ADDRESS, + COL_CONFIGURED, + COL_ONLINE, + COL_MAXMHZ, + COL_MINMHZ, +}; + +/* column description + */ +struct lscpu_coldesc { + const char *name; + const char *help; + + unsigned int is_abbr:1; /* name is abbreviation */ +}; + +static struct lscpu_coldesc coldescs[] = +{ + [COL_CPU] = { "CPU", N_("logical CPU number"), 1 }, + [COL_CORE] = { "CORE", N_("logical core number") }, + [COL_SOCKET] = { "SOCKET", N_("logical socket number") }, + [COL_NODE] = { "NODE", N_("logical NUMA node number") }, + [COL_BOOK] = { "BOOK", N_("logical book number") }, + [COL_DRAWER] = { "DRAWER", N_("logical drawer number") }, + [COL_CACHE] = { "CACHE", N_("shows how caches are shared between CPUs") }, + [COL_POLARIZATION] = { "POLARIZATION", N_("CPU dispatching mode on virtual hardware") }, + [COL_ADDRESS] = { "ADDRESS", N_("physical address of a CPU") }, + [COL_CONFIGURED] = { "CONFIGURED", N_("shows if the hypervisor has allocated the CPU") }, + [COL_ONLINE] = { "ONLINE", N_("shows if Linux currently makes use of the CPU") }, + [COL_MAXMHZ] = { "MAXMHZ", N_("shows the maximum MHz of the CPU") }, + [COL_MINMHZ] = { "MINMHZ", N_("shows the minimum MHz of the CPU") } +}; + +static int +column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(coldescs); i++) { + const char *cn = coldescs[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +/* Lookup a pattern and get the value from cpuinfo. + * Format is: + * + * "<pattern> : <key>" + */ +static int +lookup(char *line, char *pattern, char **value) +{ + char *p, *v; + int len = strlen(pattern); + + /* don't re-fill already found tags, first one wins */ + if (!*line || *value) + return 0; + + /* pattern */ + if (strncmp(line, pattern, len)) + return 0; + + /* white spaces */ + for (p = line + len; isspace(*p); p++); + + /* separator */ + if (*p != ':') + return 0; + + /* white spaces */ + for (++p; isspace(*p); p++); + + /* value */ + if (!*p) + return 0; + v = p; + + /* end of value */ + len = strlen(line) - 1; + for (p = line + len; isspace(*(p-1)); p--); + *p = '\0'; + + *value = xstrdup(v); + return 1; +} + +/* Parse extra cache lines contained within /proc/cpuinfo but which are not + * part of the cache topology information within the sysfs filesystem. + * This is true for all shared caches on e.g. s390. When there are layers of + * hypervisors in between it is not knows which CPUs share which caches. + * Therefore information about shared caches is only available in + * /proc/cpuinfo. + * Format is: + * "cache<nr> : level=<lvl> type=<type> scope=<scope> size=<size> line_size=<lsz> associativity=<as>" + */ +static int +lookup_cache(char *line, struct lscpu_desc *desc) +{ + struct cpu_cache *cache; + long long size; + char *p, type; + int level; + + /* Make sure line starts with "cache<nr> :" */ + if (strncmp(line, "cache", 5)) + return 0; + for (p = line + 5; isdigit(*p); p++); + for (; isspace(*p); p++); + if (*p != ':') + return 0; + + p = strstr(line, "scope=") + 6; + /* Skip private caches, also present in sysfs */ + if (!p || strncmp(p, "Private", 7) == 0) + return 0; + p = strstr(line, "level="); + if (!p || sscanf(p, "level=%d", &level) != 1) + return 0; + p = strstr(line, "type=") + 5; + if (!p || !*p) + return 0; + type = 0; + if (strncmp(p, "Data", 4) == 0) + type = 'd'; + if (strncmp(p, "Instruction", 11) == 0) + type = 'i'; + p = strstr(line, "size="); + if (!p || sscanf(p, "size=%lld", &size) != 1) + return 0; + + desc->necaches++; + desc->ecaches = xrealloc(desc->ecaches, + desc->necaches * sizeof(struct cpu_cache)); + cache = &desc->ecaches[desc->necaches - 1]; + memset(cache, 0 , sizeof(*cache)); + if (type) + xasprintf(&cache->name, "L%d%c", level, type); + else + xasprintf(&cache->name, "L%d", level); + xasprintf(&cache->size, "%lldK", size); + return 1; +} + +/* Don't init the mode for platforms where we are not able to + * detect that CPU supports 64-bit mode. + */ +static int +init_mode(struct lscpu_modifier *mod) +{ + int m = 0; + + if (mod->system == SYSTEM_SNAPSHOT) + /* reading info from any /{sys,proc} dump, don't mix it with + * information about our real CPU */ + return 0; + +#if defined(__alpha__) || defined(__ia64__) + m |= MODE_64BIT; /* 64bit platforms only */ +#endif + /* platforms with 64bit flag in /proc/cpuinfo, define + * 32bit default here */ +#if defined(__i386__) || defined(__x86_64__) || \ + defined(__s390x__) || defined(__s390__) || defined(__sparc_v9__) + m |= MODE_32BIT; +#endif + return m; +} + +#if defined(HAVE_LIBRTAS) +#define PROCESSOR_MODULE_INFO 43 +static int strbe16toh(const char *buf, int offset) +{ + return (buf[offset] << 8) + buf[offset+1]; +} + +static void read_physical_info_powerpc(struct lscpu_desc *desc) +{ + char buf[BUFSIZ]; + int rc, len, ntypes; + + desc->physsockets = desc->physchips = desc->physcoresperchip = 0; + + rc = rtas_get_sysparm(PROCESSOR_MODULE_INFO, sizeof(buf), buf); + if (rc < 0) + return; + + len = strbe16toh(buf, 0); + if (len < 8) + return; + + ntypes = strbe16toh(buf, 2); + + assert(ntypes <= 1); + if (!ntypes) + return; + + desc->physsockets = strbe16toh(buf, 4); + desc->physchips = strbe16toh(buf, 6); + desc->physcoresperchip = strbe16toh(buf, 8); +} +#else +static void read_physical_info_powerpc( + struct lscpu_desc *desc __attribute__((__unused__))) +{ +} +#endif + + +static void +read_basicinfo(struct lscpu_desc *desc, struct lscpu_modifier *mod) +{ + FILE *fp; + char buf[BUFSIZ]; + struct utsname utsbuf; + size_t setsize; + cpu_set_t *cpuset = NULL; + + /* architecture */ + if (uname(&utsbuf) == -1) + err(EXIT_FAILURE, _("error: uname failed")); + + fp = ul_path_fopen(desc->procfs, "r", "cpuinfo"); + if (!fp) + err(EXIT_FAILURE, _("cannot open %s"), "/proc/cpuinfo"); + desc->arch = xstrdup(utsbuf.machine); + + /* details */ + while (fgets(buf, sizeof(buf), fp) != NULL) { + if (lookup(buf, "vendor", &desc->vendor)) ; + else if (lookup(buf, "vendor_id", &desc->vendor)) ; + else if (lookup(buf, "CPU implementer", &desc->vendor)) ; /* ARM and aarch64 */ + else if (lookup(buf, "family", &desc->family)) ; + else if (lookup(buf, "cpu family", &desc->family)) ; + else if (lookup(buf, "model", &desc->model)) ; + else if (lookup(buf, "CPU part", &desc->model)) ; /* ARM and aarch64 */ + else if (lookup(buf, "model name", &desc->modelname)) ; + else if (lookup(buf, "stepping", &desc->stepping)) ; + else if (lookup(buf, "CPU variant", &desc->stepping)) ; /* aarch64 */ + else if (lookup(buf, "cpu MHz", &desc->mhz)) ; + else if (lookup(buf, "cpu MHz dynamic", &desc->dynamic_mhz)) ; /* s390 */ + else if (lookup(buf, "cpu MHz static", &desc->static_mhz)) ; /* s390 */ + else if (lookup(buf, "flags", &desc->flags)) ; /* x86 */ + else if (lookup(buf, "features", &desc->flags)) ; /* s390 */ + else if (lookup(buf, "Features", &desc->flags)) ; /* aarch64 */ + else if (lookup(buf, "type", &desc->flags)) ; /* sparc64 */ + else if (lookup(buf, "bogomips", &desc->bogomips)) ; + else if (lookup(buf, "BogoMIPS", &desc->bogomips)) ; /* aarch64 */ + else if (lookup(buf, "bogomips per cpu", &desc->bogomips)) ; /* s390 */ + else if (lookup(buf, "cpu", &desc->cpu)) ; + else if (lookup(buf, "revision", &desc->revision)) ; + else if (lookup(buf, "CPU revision", &desc->revision)) ; /* aarch64 */ + else if (lookup(buf, "max thread id", &desc->mtid)) ; /* s390 */ + else if (lookup(buf, "address sizes", &desc->addrsz)) ; /* x86 */ + else if (lookup_cache(buf, desc)) ; + else + continue; + } + + desc->mode = init_mode(mod); + + if (desc->flags) { + snprintf(buf, sizeof(buf), " %s ", desc->flags); + if (strstr(buf, " svm ")) + desc->virtflag = xstrdup("svm"); + else if (strstr(buf, " vmx ")) + desc->virtflag = xstrdup("vmx"); + if (strstr(buf, " lm ")) + desc->mode |= MODE_32BIT | MODE_64BIT; /* x86_64 */ + if (strstr(buf, " zarch ")) + desc->mode |= MODE_32BIT | MODE_64BIT; /* s390x */ + if (strstr(buf, " sun4v ") || strstr(buf, " sun4u ")) + desc->mode |= MODE_32BIT | MODE_64BIT; /* sparc64 */ + } + + if (desc->arch && mod->system != SYSTEM_SNAPSHOT) { + if (strcmp(desc->arch, "ppc64") == 0) + desc->mode |= MODE_32BIT | MODE_64BIT; + else if (strcmp(desc->arch, "ppc") == 0) + desc->mode |= MODE_32BIT; + } + + fclose(fp); + + if (ul_path_read_s32(desc->syscpu, &maxcpus, "kernel_max") == 0) + /* note that kernel_max is maximum index [NR_CPUS-1] */ + maxcpus += 1; + + else if (mod->system == SYSTEM_LIVE) + /* the root is '/' so we are working with data from the current kernel */ + maxcpus = get_max_number_of_cpus(); + + if (maxcpus <= 0) + /* error or we are reading some /sys snapshot instead of the + * real /sys, let's use any crazy number... */ + maxcpus = 2048; + + setsize = CPU_ALLOC_SIZE(maxcpus); + + if (ul_path_readf_cpulist(desc->syscpu, &cpuset, maxcpus, "possible") == 0) { + int num, idx; + + desc->ncpuspos = CPU_COUNT_S(setsize, cpuset); + desc->idx2cpunum = xcalloc(desc->ncpuspos, sizeof(int)); + + for (num = 0, idx = 0; num < maxcpus; num++) { + if (CPU_ISSET_S(num, setsize, cpuset)) + desc->idx2cpunum[idx++] = num; + } + cpuset_free(cpuset); + cpuset = NULL; + } else + err(EXIT_FAILURE, _("failed to determine number of CPUs: %s"), + _PATH_SYS_CPU "/possible"); + + + /* get mask for present CPUs */ + if (ul_path_readf_cpulist(desc->syscpu, &desc->present, maxcpus, "present") == 0) + desc->ncpus = CPU_COUNT_S(setsize, desc->present); + + /* get mask for online CPUs */ + if (ul_path_readf_cpulist(desc->syscpu, &desc->online, maxcpus, "online") == 0) + desc->nthreads = CPU_COUNT_S(setsize, desc->online); + + /* get dispatching mode */ + if (ul_path_read_s32(desc->syscpu, &desc->dispatching, "dispatching") != 0) + desc->dispatching = -1; + + if (mod->system == SYSTEM_LIVE) + read_physical_info_powerpc(desc); + + if ((fp = ul_path_fopen(desc->procfs, "r", "sysinfo"))) { + while (fgets(buf, sizeof(buf), fp) != NULL && !desc->machinetype) + lookup(buf, "Type", &desc->machinetype); + fclose(fp); + } +} + +static int +has_pci_device(struct lscpu_desc *desc, unsigned int vendor, unsigned int device) +{ + FILE *f; + unsigned int num, fn, ven, dev; + int res = 1; + + f = ul_path_fopen(desc->procfs, "r", "bus/pci/devices"); + if (!f) + return 0; + + /* for more details about bus/pci/devices format see + * drivers/pci/proc.c in linux kernel + */ + while(fscanf(f, "%02x%02x\t%04x%04x\t%*[^\n]", + &num, &fn, &ven, &dev) == 4) { + + if (ven == vendor && dev == device) + goto found; + } + + res = 0; +found: + fclose(f); + return res; +} + +#if defined(__x86_64__) || defined(__i386__) + +/* + * This CPUID leaf returns the information about the hypervisor. + * EAX : maximum input value for CPUID supported by the hypervisor. + * EBX, ECX, EDX : Hypervisor vendor ID signature. E.g. VMwareVMware. + */ +#define HYPERVISOR_INFO_LEAF 0x40000000 + +static inline void +cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + __asm__( +#if defined(__PIC__) && defined(__i386__) + /* x86 PIC cannot clobber ebx -- gcc bitches */ + "xchg %%ebx, %%esi;" + "cpuid;" + "xchg %%esi, %%ebx;" + : "=S" (*ebx), +#else + "cpuid;" + : "=b" (*ebx), +#endif + "=a" (*eax), + "=c" (*ecx), + "=d" (*edx) + : "1" (op), "c"(0)); +} + +static void +read_hypervisor_cpuid(struct lscpu_desc *desc) +{ + unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; + char hyper_vendor_id[13]; + + memset(hyper_vendor_id, 0, sizeof(hyper_vendor_id)); + + cpuid(HYPERVISOR_INFO_LEAF, &eax, &ebx, &ecx, &edx); + memcpy(hyper_vendor_id + 0, &ebx, 4); + memcpy(hyper_vendor_id + 4, &ecx, 4); + memcpy(hyper_vendor_id + 8, &edx, 4); + hyper_vendor_id[12] = '\0'; + + if (!hyper_vendor_id[0]) + return; + + if (!strncmp("XenVMMXenVMM", hyper_vendor_id, 12)) + desc->hyper = HYPER_XEN; + else if (!strncmp("KVMKVMKVM", hyper_vendor_id, 9)) + desc->hyper = HYPER_KVM; + else if (!strncmp("Microsoft Hv", hyper_vendor_id, 12)) + desc->hyper = HYPER_MSHV; + else if (!strncmp("VMwareVMware", hyper_vendor_id, 12)) + desc->hyper = HYPER_VMWARE; + else if (!strncmp("UnisysSpar64", hyper_vendor_id, 12)) + desc->hyper = HYPER_SPAR; +} + +#else /* ! (__x86_64__ || __i386__) */ +static void +read_hypervisor_cpuid(struct lscpu_desc *desc __attribute__((__unused__))) +{ +} +#endif + +static int is_devtree_compatible(struct lscpu_desc *desc, const char *str) +{ + FILE *fd = ul_path_fopen(desc->procfs, "r", "device-tree/compatible"); + + if (fd) { + char buf[256]; + size_t i, len; + + memset(buf, 0, sizeof(buf)); + len = fread(buf, 1, sizeof(buf) - 1, fd); + fclose(fd); + + for (i = 0; i < len;) { + if (!strcmp(&buf[i], str)) + return 1; + i += strlen(&buf[i]); + i++; + } + } + + return 0; +} + +static int +read_hypervisor_powerpc(struct lscpu_desc *desc) +{ + assert(!desc->hyper); + + /* IBM iSeries: legacy, para-virtualized on top of OS/400 */ + if (ul_path_access(desc->procfs, F_OK, "iSeries") == 0) { + desc->hyper = HYPER_OS400; + desc->virtype = VIRT_PARA; + + /* PowerNV (POWER Non-Virtualized, bare-metal) */ + } else if (is_devtree_compatible(desc, "ibm,powernv")) { + desc->hyper = HYPER_NONE; + desc->virtype = VIRT_NONE; + + /* PowerVM (IBM's proprietary hypervisor, aka pHyp) */ + } else if (ul_path_access(desc->procfs, F_OK, "device-tree/ibm,partition-name") == 0 + && ul_path_access(desc->procfs, F_OK, "device-tree/hmc-managed?") == 0 + && ul_path_access(desc->procfs, F_OK, "device-tree/chosen/qemu,graphic-width") != 0) { + + FILE *fd; + desc->hyper = HYPER_PHYP; + desc->virtype = VIRT_PARA; + + fd = ul_path_fopen(desc->procfs, "r", "device-tree/ibm,partition-name"); + if (fd) { + char buf[256]; + if (fscanf(fd, "%255s", buf) == 1 && !strcmp(buf, "full")) + desc->virtype = VIRT_NONE; + fclose(fd); + } + + /* Qemu */ + } else if (is_devtree_compatible(desc, "qemu,pseries")) { + desc->hyper = HYPER_KVM; + desc->virtype = VIRT_PARA; + } + return desc->hyper; +} + +#ifdef INCLUDE_VMWARE_BDOOR + +#define VMWARE_BDOOR_MAGIC 0x564D5868 +#define VMWARE_BDOOR_PORT 0x5658 +#define VMWARE_BDOOR_CMD_GETVERSION 10 + +static UL_ASAN_BLACKLIST +void vmware_bdoor(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + __asm__( +#if defined(__PIC__) && defined(__i386__) + /* x86 PIC cannot clobber ebx -- gcc bitches */ + "xchg %%ebx, %%esi;" + "inl (%%dx), %%eax;" + "xchg %%esi, %%ebx;" + : "=S" (*ebx), +#else + "inl (%%dx), %%eax;" + : "=b" (*ebx), +#endif + "=a" (*eax), + "=c" (*ecx), + "=d" (*edx) + : "0" (VMWARE_BDOOR_MAGIC), + "1" (VMWARE_BDOOR_CMD_GETVERSION), + "2" (VMWARE_BDOOR_PORT), + "3" (0) + : "memory"); +} + +static jmp_buf segv_handler_env; + +static void +segv_handler(__attribute__((__unused__)) int sig, + __attribute__((__unused__)) siginfo_t *info, + __attribute__((__unused__)) void *ignored) +{ + siglongjmp(segv_handler_env, 1); +} + +static int +is_vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + struct sigaction act, oact; + + /* + * FIXME: Not reliable for non-root users. Note it works as expected if + * vmware_bdoor() is not optimized for PIE, but then it fails to build + * on 32bit x86 systems. See lscpu git log for more details (commit + * 7845b91dbc7690064a2be6df690e4aaba728fb04). kzak [3-Nov-2016] + */ + if (getuid() != 0) + return 0; + + /* + * The assembly routine for vmware detection works + * fine under vmware, even if ran as regular user. But + * on real HW or under other hypervisors, it segfaults (which is + * expected). So we temporarily install SIGSEGV handler to catch + * the signal. All this magic is needed because lscpu + * isn't supposed to require root privileges. + */ + if (sigsetjmp(segv_handler_env, 1)) + return 0; + + memset(&act, 0, sizeof(act)); + act.sa_sigaction = segv_handler; + act.sa_flags = SA_SIGINFO; + + if (sigaction(SIGSEGV, &act, &oact)) + err(EXIT_FAILURE, _("cannot set signal handler")); + + vmware_bdoor(&eax, &ebx, &ecx, &edx); + + if (sigaction(SIGSEGV, &oact, NULL)) + err(EXIT_FAILURE, _("cannot restore signal handler")); + + return eax != (uint32_t)-1 && ebx == VMWARE_BDOOR_MAGIC; +} + +#else /* ! INCLUDE_VMWARE_BDOOR */ + +static int +is_vmware_platform(void) +{ + return 0; +} + +#endif /* INCLUDE_VMWARE_BDOOR */ + +static void +read_hypervisor(struct lscpu_desc *desc, struct lscpu_modifier *mod) +{ + FILE *fd; + + /* We have to detect WSL first. is_vmware_platform() crashes on Windows 10. */ + + if ((fd = ul_path_fopen(desc->procfs, "r", "sys/kernel/osrelease"))) { + char buf[256]; + + if (fgets(buf, sizeof(buf), fd) != NULL) { + if (strstr(buf, "Microsoft")) { + desc->hyper = HYPER_WSL; + desc->virtype = VIRT_CONT; + } + } + fclose(fd); + if (desc->virtype) + return; + } + + if (mod->system != SYSTEM_SNAPSHOT) { + read_hypervisor_cpuid(desc); + if (!desc->hyper) + desc->hyper = read_hypervisor_dmi(); + if (!desc->hyper && is_vmware_platform()) + desc->hyper = HYPER_VMWARE; + } + + if (desc->hyper) { + desc->virtype = VIRT_FULL; + + if (desc->hyper == HYPER_XEN) { + uint32_t features; + + fd = ul_prefix_fopen(desc->prefix, "r", _PATH_SYS_HYP_FEATURES); + + if (fd && fscanf(fd, "%x", &features) == 1) { + /* Xen PV domain */ + if (features & XEN_FEATURES_PV_MASK) + desc->virtype = VIRT_PARA; + /* Xen PVH domain */ + else if ((features & XEN_FEATURES_PVH_MASK) + == XEN_FEATURES_PVH_MASK) + desc->virtype = VIRT_PARA; + } + if (fd) + fclose(fd); + } + } else if (read_hypervisor_powerpc(desc) > 0) {} + + /* Xen para-virt or dom0 */ + else if (ul_path_access(desc->procfs, F_OK, "xen") == 0) { + int dom0 = 0; + + fd = ul_path_fopen(desc->procfs, "r", "xen/capabilities"); + if (fd) { + char buf[256]; + + if (fscanf(fd, "%255s", buf) == 1 && + !strcmp(buf, "control_d")) + dom0 = 1; + fclose(fd); + } + desc->virtype = dom0 ? VIRT_NONE : VIRT_PARA; + desc->hyper = HYPER_XEN; + + /* Xen full-virt on non-x86_64 */ + } else if (has_pci_device(desc, hv_vendor_pci[HYPER_XEN], hv_graphics_pci[HYPER_XEN])) { + desc->hyper = HYPER_XEN; + desc->virtype = VIRT_FULL; + } else if (has_pci_device(desc, hv_vendor_pci[HYPER_VMWARE], hv_graphics_pci[HYPER_VMWARE])) { + desc->hyper = HYPER_VMWARE; + desc->virtype = VIRT_FULL; + } else if (has_pci_device(desc, hv_vendor_pci[HYPER_VBOX], hv_graphics_pci[HYPER_VBOX])) { + desc->hyper = HYPER_VBOX; + desc->virtype = VIRT_FULL; + + /* IBM PR/SM */ + } else if ((fd = ul_path_fopen(desc->procfs, "r", "sysinfo"))) { + char buf[BUFSIZ]; + + desc->hyper = HYPER_IBM; + desc->hypervisor = "PR/SM"; + desc->virtype = VIRT_FULL; + while (fgets(buf, sizeof(buf), fd) != NULL) { + char *str, *p; + + if (!strstr(buf, "Control Program:")) + continue; + if (!strstr(buf, "KVM")) + desc->hyper = HYPER_IBM; + else + desc->hyper = HYPER_KVM; + p = strchr(buf, ':'); + if (!p) + continue; + xasprintf(&str, "%s", p + 1); + + /* remove leading, trailing and repeating whitespace */ + while (*str == ' ') + str++; + desc->hypervisor = str; + str += strlen(str) - 1; + while ((*str == '\n') || (*str == ' ')) + *(str--) = '\0'; + while ((str = strstr(desc->hypervisor, " "))) + memmove(str, str + 1, strlen(str)); + break; + } + fclose(fd); + } + + /* OpenVZ/Virtuozzo - /proc/vz dir should exist + * /proc/bc should not */ + else if (ul_path_access(desc->procfs, F_OK, "vz") == 0 && + ul_path_access(desc->procfs, F_OK, "bc") != 0) { + desc->hyper = HYPER_PARALLELS; + desc->virtype = VIRT_CONT; + + /* IBM */ + } else if (desc->vendor && + (strcmp(desc->vendor, "PowerVM Lx86") == 0 || + strcmp(desc->vendor, "IBM/S390") == 0)) { + desc->hyper = HYPER_IBM; + desc->virtype = VIRT_FULL; + + /* User-mode-linux */ + } else if (desc->modelname && strstr(desc->modelname, "UML")) { + desc->hyper = HYPER_UML; + desc->virtype = VIRT_PARA; + + /* Linux-VServer */ + } else if ((fd = ul_path_fopen(desc->procfs, "r", "self/status"))) { + char buf[BUFSIZ]; + char *val = NULL; + + while (fgets(buf, sizeof(buf), fd) != NULL) { + if (lookup(buf, "VxID", &val)) + break; + } + fclose(fd); + + if (val) { + char *org = val; + + while (isdigit(*val)) + ++val; + if (!*val) { + desc->hyper = HYPER_VSERVER; + desc->virtype = VIRT_CONT; + } + free(org); + } + } +} + +/* add @set to the @ary, unnecessary set is deallocated. */ +static int add_cpuset_to_array(cpu_set_t **ary, int *items, cpu_set_t *set) +{ + int i; + size_t setsize = CPU_ALLOC_SIZE(maxcpus); + + if (!ary) + return -1; + + for (i = 0; i < *items; i++) { + if (CPU_EQUAL_S(setsize, set, ary[i])) + break; + } + if (i == *items) { + ary[*items] = set; + ++*items; + return 0; + } + CPU_FREE(set); + return 1; +} + +static void +read_topology(struct lscpu_desc *desc, int idx) +{ + cpu_set_t *thread_siblings, *core_siblings; + cpu_set_t *book_siblings, *drawer_siblings; + int coreid, socketid, bookid, drawerid; + int i, num = real_cpu_num(desc, idx); + + if (ul_path_accessf(desc->syscpu, F_OK, "cpu%d/topology/thread_siblings", num) != 0) + return; + + ul_path_readf_cpuset(desc->syscpu, &thread_siblings, maxcpus, + "cpu%d/topology/thread_siblings", num); + ul_path_readf_cpuset(desc->syscpu, &core_siblings, maxcpus, + "cpu%d/topology/core_siblings", num); + ul_path_readf_cpuset(desc->syscpu, &book_siblings, maxcpus, + "cpu%d/topology/book_siblings", num); + ul_path_readf_cpuset(desc->syscpu, &drawer_siblings, maxcpus, + "cpu%d/topology/drawer_siblings", num); + + if (ul_path_readf_s32(desc->syscpu, &coreid, "cpu%d/topology/core_id", num) != 0) + coreid = -1; + + if (ul_path_readf_s32(desc->syscpu, &socketid, "cpu%d/topology/physical_package_id", num) != 0) + socketid = -1; + + if (ul_path_readf_s32(desc->syscpu, &bookid, "cpu%d/topology/book_id", num) != 0) + bookid = -1; + + if (ul_path_readf_s32(desc->syscpu, &drawerid, "cpu%d/topology/drawer_id", num) != 0) + drawerid = -1; + + if (!desc->coremaps) { + int ndrawers, nbooks, nsockets, ncores, nthreads; + size_t setsize = CPU_ALLOC_SIZE(maxcpus); + + /* threads within one core */ + nthreads = CPU_COUNT_S(setsize, thread_siblings); + if (!nthreads) + nthreads = 1; + + /* cores within one socket */ + ncores = CPU_COUNT_S(setsize, core_siblings) / nthreads; + if (!ncores) + ncores = 1; + + /* number of sockets within one book. Because of odd / + * non-present cpu maps and to keep calculation easy we make + * sure that nsockets and nbooks is at least 1. + */ + nsockets = desc->ncpus / nthreads / ncores; + if (!nsockets) + nsockets = 1; + + /* number of books */ + nbooks = desc->ncpus / nthreads / ncores / nsockets; + if (!nbooks) + nbooks = 1; + + /* number of drawers */ + ndrawers = desc->ncpus / nbooks / nthreads / ncores / nsockets; + if (!ndrawers) + ndrawers = 1; + + /* all threads, see also read_basicinfo() + * -- fallback for kernels without + * /sys/devices/system/cpu/online. + */ + if (!desc->nthreads) + desc->nthreads = ndrawers * nbooks * nsockets * ncores * nthreads; + + /* For each map we make sure that it can have up to ncpuspos + * entries. This is because we cannot reliably calculate the + * number of cores, sockets and books on all architectures. + * E.g. completely virtualized architectures like s390 may + * have multiple sockets of different sizes. + */ + desc->coremaps = xcalloc(desc->ncpuspos, sizeof(cpu_set_t *)); + desc->socketmaps = xcalloc(desc->ncpuspos, sizeof(cpu_set_t *)); + desc->coreids = xcalloc(desc->ncpuspos, sizeof(*desc->drawerids)); + desc->socketids = xcalloc(desc->ncpuspos, sizeof(*desc->drawerids)); + for (i = 0; i < desc->ncpuspos; i++) + desc->coreids[i] = desc->socketids[i] = -1; + if (book_siblings) { + desc->bookmaps = xcalloc(desc->ncpuspos, sizeof(cpu_set_t *)); + desc->bookids = xcalloc(desc->ncpuspos, sizeof(*desc->drawerids)); + for (i = 0; i < desc->ncpuspos; i++) + desc->bookids[i] = -1; + } + if (drawer_siblings) { + desc->drawermaps = xcalloc(desc->ncpuspos, sizeof(cpu_set_t *)); + desc->drawerids = xcalloc(desc->ncpuspos, sizeof(*desc->drawerids)); + for (i = 0; i < desc->ncpuspos; i++) + desc->drawerids[i] = -1; + } + } + + add_cpuset_to_array(desc->socketmaps, &desc->nsockets, core_siblings); + desc->coreids[idx] = coreid; + add_cpuset_to_array(desc->coremaps, &desc->ncores, thread_siblings); + desc->socketids[idx] = socketid; + if (book_siblings) { + add_cpuset_to_array(desc->bookmaps, &desc->nbooks, book_siblings); + desc->bookids[idx] = bookid; + } + if (drawer_siblings) { + add_cpuset_to_array(desc->drawermaps, &desc->ndrawers, drawer_siblings); + desc->drawerids[idx] = drawerid; + } +} + +static void +read_polarization(struct lscpu_desc *desc, int idx) +{ + char mode[64]; + int num = real_cpu_num(desc, idx); + + if (desc->dispatching < 0) + return; + if (ul_path_accessf(desc->syscpu, F_OK, "cpu%d/polarization", num) != 0) + return; + if (!desc->polarization) + desc->polarization = xcalloc(desc->ncpuspos, sizeof(int)); + + ul_path_readf_buffer(desc->syscpu, mode, sizeof(mode), "cpu%d/polarization", num); + + if (strncmp(mode, "vertical:low", sizeof(mode)) == 0) + desc->polarization[idx] = POLAR_VLOW; + else if (strncmp(mode, "vertical:medium", sizeof(mode)) == 0) + desc->polarization[idx] = POLAR_VMEDIUM; + else if (strncmp(mode, "vertical:high", sizeof(mode)) == 0) + desc->polarization[idx] = POLAR_VHIGH; + else if (strncmp(mode, "horizontal", sizeof(mode)) == 0) + desc->polarization[idx] = POLAR_HORIZONTAL; + else + desc->polarization[idx] = POLAR_UNKNOWN; +} + +static void +read_address(struct lscpu_desc *desc, int idx) +{ + int num = real_cpu_num(desc, idx); + + if (ul_path_accessf(desc->syscpu, F_OK, "cpu%d/address", num) != 0) + return; + if (!desc->addresses) + desc->addresses = xcalloc(desc->ncpuspos, sizeof(int)); + ul_path_readf_s32(desc->syscpu, &desc->addresses[idx], "cpu%d/address", num); +} + +static void +read_configured(struct lscpu_desc *desc, int idx) +{ + int num = real_cpu_num(desc, idx); + + if (ul_path_accessf(desc->syscpu, F_OK, "cpu%d/configure", num) != 0) + return; + if (!desc->configured) + desc->configured = xcalloc(desc->ncpuspos, sizeof(int)); + ul_path_readf_s32(desc->syscpu, &desc->configured[idx], "cpu%d/configure", num); +} + +/* Read overall maximum frequency of cpu */ +static char * +cpu_max_mhz(struct lscpu_desc *desc, char *buf, size_t bufsz) +{ + int i; + float cpu_freq = 0.0; + size_t setsize = CPU_ALLOC_SIZE(maxcpus); + + if (desc->present) { + for (i = 0; i < desc->ncpuspos; i++) { + if (CPU_ISSET_S(real_cpu_num(desc, i), setsize, desc->present) + && desc->maxmhz[i]) { + float freq = atof(desc->maxmhz[i]); + + if (freq > cpu_freq) + cpu_freq = freq; + } + } + } + snprintf(buf, bufsz, "%.4f", cpu_freq); + return buf; +} + +/* Read overall minimum frequency of cpu */ +static char * +cpu_min_mhz(struct lscpu_desc *desc, char *buf, size_t bufsz) +{ + int i; + float cpu_freq = -1.0; + size_t setsize = CPU_ALLOC_SIZE(maxcpus); + + if (desc->present) { + for (i = 0; i < desc->ncpuspos; i++) { + if (CPU_ISSET_S(real_cpu_num(desc, i), setsize, desc->present) + && desc->minmhz[i]) { + float freq = atof(desc->minmhz[i]); + + if (cpu_freq < 0.0 || freq < cpu_freq) + cpu_freq = freq; + } + } + } + snprintf(buf, bufsz, "%.4f", cpu_freq); + return buf; +} + + +static void +read_max_mhz(struct lscpu_desc *desc, int idx) +{ + int num = real_cpu_num(desc, idx); + int mhz; + + if (ul_path_readf_s32(desc->syscpu, &mhz, "cpu%d/cpufreq/cpuinfo_max_freq", num) != 0) + return; + if (!desc->maxmhz) + desc->maxmhz = xcalloc(desc->ncpuspos, sizeof(char *)); + xasprintf(&desc->maxmhz[idx], "%.4f", (float) mhz / 1000); +} + +static void +read_min_mhz(struct lscpu_desc *desc, int idx) +{ + int num = real_cpu_num(desc, idx); + int mhz; + + if (ul_path_readf_s32(desc->syscpu, &mhz, "cpu%d/cpufreq/cpuinfo_min_freq", num) != 0) + return; + if (!desc->minmhz) + desc->minmhz = xcalloc(desc->ncpuspos, sizeof(char *)); + xasprintf(&desc->minmhz[idx], "%.4f", (float) mhz / 1000); +} + +static int +cachecmp(const void *a, const void *b) +{ + struct cpu_cache *c1 = (struct cpu_cache *) a; + struct cpu_cache *c2 = (struct cpu_cache *) b; + + return strcmp(c2->name, c1->name); +} + +static void +read_cache(struct lscpu_desc *desc, int idx) +{ + char buf[256]; + int i; + int num = real_cpu_num(desc, idx); + + if (!desc->ncaches) { + while (ul_path_accessf(desc->syscpu, F_OK, + "cpu%d/cache/index%d", + num, desc->ncaches) == 0) + desc->ncaches++; + + if (!desc->ncaches) + return; + desc->caches = xcalloc(desc->ncaches, sizeof(*desc->caches)); + } + for (i = 0; i < desc->ncaches; i++) { + struct cpu_cache *ca = &desc->caches[i]; + cpu_set_t *map; + + if (ul_path_accessf(desc->syscpu, F_OK, + "cpu%d/cache/index%d", num, i) != 0) + continue; + if (!ca->name) { + int type = 0, level; + + /* cache type */ + if (ul_path_readf_buffer(desc->syscpu, buf, sizeof(buf), + "cpu%d/cache/index%d/type", num, i) > 0) { + if (!strcmp(buf, "Data")) + type = 'd'; + else if (!strcmp(buf, "Instruction")) + type = 'i'; + } + + /* cache level */ + ul_path_readf_s32(desc->syscpu, &level, + "cpu%d/cache/index%d/level", num, i); + if (type) + snprintf(buf, sizeof(buf), "L%d%c", level, type); + else + snprintf(buf, sizeof(buf), "L%d", level); + + ca->name = xstrdup(buf); + + /* cache size */ + if (ul_path_readf_string(desc->syscpu, &ca->size, + "cpu%d/cache/index%d/size", num, i) < 0) + ca->size = xstrdup("unknown size"); + } + + /* information about how CPUs share different caches */ + ul_path_readf_cpuset(desc->syscpu, &map, maxcpus, + "cpu%d/cache/index%d/shared_cpu_map", num, i); + + if (!ca->sharedmaps) + ca->sharedmaps = xcalloc(desc->ncpuspos, sizeof(cpu_set_t *)); + add_cpuset_to_array(ca->sharedmaps, &ca->nsharedmaps, map); + } +} + +static inline int is_node_dirent(struct dirent *d) +{ + return + d && +#ifdef _DIRENT_HAVE_D_TYPE + (d->d_type == DT_DIR || d->d_type == DT_UNKNOWN) && +#endif + strncmp(d->d_name, "node", 4) == 0 && + isdigit_string(d->d_name + 4); +} + +static int +nodecmp(const void *ap, const void *bp) +{ + int *a = (int *) ap, *b = (int *) bp; + return *a - *b; +} + +static void +read_nodes(struct lscpu_desc *desc) +{ + int i = 0; + DIR *dir; + struct dirent *d; + struct path_cxt *sysnode; + + desc->nnodes = 0; + + sysnode = ul_new_path(_PATH_SYS_NODE); + if (!sysnode) + err(EXIT_FAILURE, _("failed to initialize %s handler"), _PATH_SYS_NODE); + ul_path_set_prefix(sysnode, desc->prefix); + + dir = ul_path_opendir(sysnode, NULL); + if (!dir) + goto done; + + while ((d = readdir(dir))) { + if (is_node_dirent(d)) + desc->nnodes++; + } + + if (!desc->nnodes) { + closedir(dir); + goto done; + } + + desc->nodemaps = xcalloc(desc->nnodes, sizeof(cpu_set_t *)); + desc->idx2nodenum = xmalloc(desc->nnodes * sizeof(int)); + + rewinddir(dir); + while ((d = readdir(dir)) && i < desc->nnodes) { + if (is_node_dirent(d)) + desc->idx2nodenum[i++] = strtol_or_err(((d->d_name) + 4), + _("Failed to extract the node number")); + } + closedir(dir); + qsort(desc->idx2nodenum, desc->nnodes, sizeof(int), nodecmp); + + /* information about how nodes share different CPUs */ + for (i = 0; i < desc->nnodes; i++) + ul_path_readf_cpuset(sysnode, &desc->nodemaps[i], maxcpus, + "node%d/cpumap", desc->idx2nodenum[i]); +done: + ul_unref_path(sysnode); +} + +static char * +get_cell_data(struct lscpu_desc *desc, int idx, int col, + struct lscpu_modifier *mod, + char *buf, size_t bufsz) +{ + size_t setsize = CPU_ALLOC_SIZE(maxcpus); + size_t i; + int cpu = real_cpu_num(desc, idx); + + *buf = '\0'; + + switch (col) { + case COL_CPU: + snprintf(buf, bufsz, "%d", cpu); + break; + case COL_CORE: + if (mod->physical) { + if (desc->coreids[idx] == -1) + snprintf(buf, bufsz, "-"); + else + snprintf(buf, bufsz, "%d", desc->coreids[idx]); + } else { + if (cpuset_ary_isset(cpu, desc->coremaps, + desc->ncores, setsize, &i) == 0) + snprintf(buf, bufsz, "%zu", i); + } + break; + case COL_SOCKET: + if (mod->physical) { + if (desc->socketids[idx] == -1) + snprintf(buf, bufsz, "-"); + else + snprintf(buf, bufsz, "%d", desc->socketids[idx]); + } else { + if (cpuset_ary_isset(cpu, desc->socketmaps, + desc->nsockets, setsize, &i) == 0) + snprintf(buf, bufsz, "%zu", i); + } + break; + case COL_NODE: + if (cpuset_ary_isset(cpu, desc->nodemaps, + desc->nnodes, setsize, &i) == 0) + snprintf(buf, bufsz, "%d", desc->idx2nodenum[i]); + break; + case COL_DRAWER: + if (mod->physical) { + if (desc->drawerids[idx] == -1) + snprintf(buf, bufsz, "-"); + else + snprintf(buf, bufsz, "%d", desc->drawerids[idx]); + } else { + if (cpuset_ary_isset(cpu, desc->drawermaps, + desc->ndrawers, setsize, &i) == 0) + snprintf(buf, bufsz, "%zu", i); + } + break; + case COL_BOOK: + if (mod->physical) { + if (desc->bookids[idx] == -1) + snprintf(buf, bufsz, "-"); + else + snprintf(buf, bufsz, "%d", desc->bookids[idx]); + } else { + if (cpuset_ary_isset(cpu, desc->bookmaps, + desc->nbooks, setsize, &i) == 0) + snprintf(buf, bufsz, "%zu", i); + } + break; + case COL_CACHE: + { + char *p = buf; + size_t sz = bufsz; + int j; + + for (j = desc->ncaches - 1; j >= 0; j--) { + struct cpu_cache *ca = &desc->caches[j]; + + if (cpuset_ary_isset(cpu, ca->sharedmaps, + ca->nsharedmaps, setsize, &i) == 0) { + int x = snprintf(p, sz, "%zu", i); + if (x < 0 || (size_t) x >= sz) + return NULL; + p += x; + sz -= x; + } + if (j != 0) { + if (sz < 2) + return NULL; + *p++ = mod->compat ? ',' : ':'; + *p = '\0'; + sz--; + } + } + break; + } + case COL_POLARIZATION: + if (desc->polarization) { + int x = desc->polarization[idx]; + + snprintf(buf, bufsz, "%s", + mod->mode == OUTPUT_PARSABLE ? + polar_modes[x].parsable : + polar_modes[x].readable); + } + break; + case COL_ADDRESS: + if (desc->addresses) + snprintf(buf, bufsz, "%d", desc->addresses[idx]); + break; + case COL_CONFIGURED: + if (!desc->configured) + break; + if (mod->mode == OUTPUT_PARSABLE) + snprintf(buf, bufsz, "%s", + desc->configured[idx] ? _("Y") : _("N")); + else + snprintf(buf, bufsz, "%s", + desc->configured[idx] ? _("yes") : _("no")); + break; + case COL_ONLINE: + if (!desc->online) + break; + if (mod->mode == OUTPUT_PARSABLE) + snprintf(buf, bufsz, "%s", + is_cpu_online(desc, cpu) ? _("Y") : _("N")); + else + snprintf(buf, bufsz, "%s", + is_cpu_online(desc, cpu) ? _("yes") : _("no")); + break; + case COL_MAXMHZ: + if (desc->maxmhz && desc->maxmhz[idx]) + xstrncpy(buf, desc->maxmhz[idx], bufsz); + break; + case COL_MINMHZ: + if (desc->minmhz && desc->minmhz[idx]) + xstrncpy(buf, desc->minmhz[idx], bufsz); + break; + } + return buf; +} + +static char * +get_cell_header(struct lscpu_desc *desc, int col, + struct lscpu_modifier *mod, + char *buf, size_t bufsz) +{ + *buf = '\0'; + + if (col == COL_CACHE) { + char *p = buf; + size_t sz = bufsz; + int i; + + for (i = desc->ncaches - 1; i >= 0; i--) { + int x = snprintf(p, sz, "%s", desc->caches[i].name); + if (x < 0 || (size_t) x >= sz) + return NULL; + sz -= x; + p += x; + if (i > 0) { + if (sz < 2) + return NULL; + *p++ = mod->compat ? ',' : ':'; + *p = '\0'; + sz--; + } + } + if (desc->ncaches) + return buf; + } + snprintf(buf, bufsz, "%s", coldescs[col].name); + return buf; +} + +/* + * [-p] backend, we support two parsable formats: + * + * 1) "compatible" -- this format is compatible with the original lscpu(1) + * output and it contains fixed set of the columns. The CACHE columns are at + * the end of the line and the CACHE is not printed if the number of the caches + * is zero. The CACHE columns are separated by two commas, for example: + * + * $ lscpu --parse + * # CPU,Core,Socket,Node,,L1d,L1i,L2 + * 0,0,0,0,,0,0,0 + * 1,1,0,0,,1,1,0 + * + * 2) "user defined output" -- this format prints always all columns without + * special prefix for CACHE column. If there are not CACHEs then the column is + * empty and the header "Cache" is printed rather than a real name of the cache. + * The CACHE columns are separated by ':'. + * + * $ lscpu --parse=CPU,CORE,SOCKET,NODE,CACHE + * # CPU,Core,Socket,Node,L1d:L1i:L2 + * 0,0,0,0,0:0:0 + * 1,1,0,0,1:1:0 + */ +static void +print_parsable(struct lscpu_desc *desc, int cols[], int ncols, + struct lscpu_modifier *mod) +{ + char buf[BUFSIZ], *data; + int i; + + /* + * Header + */ + printf(_( + "# The following is the parsable format, which can be fed to other\n" + "# programs. Each different item in every column has an unique ID\n" + "# starting from zero.\n")); + + fputs("# ", stdout); + for (i = 0; i < ncols; i++) { + int col = cols[i]; + + if (col == COL_CACHE) { + if (mod->compat && !desc->ncaches) + continue; + if (mod->compat && i != 0) + putchar(','); + } + if (i > 0) + putchar(','); + + data = get_cell_header(desc, col, mod, buf, sizeof(buf)); + + if (data && * data && col != COL_CACHE && + !coldescs[col].is_abbr) { + /* + * For normal column names use mixed case (e.g. "Socket") + */ + char *p = data + 1; + + while (p && *p != '\0') { + *p = tolower((unsigned int) *p); + p++; + } + } + fputs(data && *data ? data : "", stdout); + } + putchar('\n'); + + /* + * Data + */ + for (i = 0; i < desc->ncpuspos; i++) { + int c; + int cpu = real_cpu_num(desc, i); + + if (!mod->offline && desc->online && !is_cpu_online(desc, cpu)) + continue; + if (!mod->online && desc->online && is_cpu_online(desc, cpu)) + continue; + if (desc->present && !is_cpu_present(desc, cpu)) + continue; + for (c = 0; c < ncols; c++) { + if (mod->compat && cols[c] == COL_CACHE) { + if (!desc->ncaches) + continue; + if (c > 0) + putchar(','); + } + if (c > 0) + putchar(','); + + data = get_cell_data(desc, i, cols[c], mod, + buf, sizeof(buf)); + fputs(data && *data ? data : "", stdout); + } + putchar('\n'); + } +} + +/* + * [-e] backend + */ +static void +print_readable(struct lscpu_desc *desc, int cols[], int ncols, + struct lscpu_modifier *mod) +{ + int i; + char buf[BUFSIZ]; + const char *data; + struct libscols_table *table; + + scols_init_debug(0); + + table = scols_new_table(); + if (!table) + err(EXIT_FAILURE, _("failed to allocate output table")); + if (mod->json) { + scols_table_enable_json(table, 1); + scols_table_set_name(table, "cpus"); + } + + for (i = 0; i < ncols; i++) { + data = get_cell_header(desc, cols[i], mod, buf, sizeof(buf)); + if (!scols_table_new_column(table, data, 0, 0)) + err(EXIT_FAILURE, _("failed to allocate output column")); + } + + for (i = 0; i < desc->ncpuspos; i++) { + int c; + struct libscols_line *line; + int cpu = real_cpu_num(desc, i); + + if (!mod->offline && desc->online && !is_cpu_online(desc, cpu)) + continue; + if (!mod->online && desc->online && is_cpu_online(desc, cpu)) + continue; + if (desc->present && !is_cpu_present(desc, cpu)) + continue; + + line = scols_table_new_line(table, NULL); + if (!line) + err(EXIT_FAILURE, _("failed to allocate output line")); + + for (c = 0; c < ncols; c++) { + data = get_cell_data(desc, i, cols[c], mod, + buf, sizeof(buf)); + if (!data || !*data) + data = "-"; + if (scols_line_set_data(line, c, data)) + err(EXIT_FAILURE, _("failed to add output data")); + } + } + + scols_print_table(table); + scols_unref_table(table); +} + + +static void __attribute__ ((__format__(printf, 3, 4))) + add_summary_sprint(struct libscols_table *tb, + const char *txt, + const char *fmt, + ...) +{ + struct libscols_line *ln = scols_table_new_line(tb, NULL); + char *data; + va_list args; + + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + + /* description column */ + scols_line_set_data(ln, 0, txt); + + /* data column */ + va_start(args, fmt); + xvasprintf(&data, fmt, args); + va_end(args); + + if (data && scols_line_refer_data(ln, 1, data)) + err(EXIT_FAILURE, _("failed to add output data")); +} + +#define add_summary_n(tb, txt, num) add_summary_sprint(tb, txt, "%d", num) +#define add_summary_s(tb, txt, str) add_summary_sprint(tb, txt, "%s", str) + +static void +print_cpuset(struct libscols_table *tb, + const char *key, cpu_set_t *set, int hex) +{ + size_t setsize = CPU_ALLOC_SIZE(maxcpus); + size_t setbuflen = 7 * maxcpus; + char setbuf[setbuflen], *p; + + if (hex) { + p = cpumask_create(setbuf, setbuflen, set, setsize); + add_summary_s(tb, key, p); + } else { + p = cpulist_create(setbuf, setbuflen, set, setsize); + add_summary_s(tb, key, p); + } +} + +/* + * default output + */ +static void +print_summary(struct lscpu_desc *desc, struct lscpu_modifier *mod) +{ + char buf[BUFSIZ]; + int i = 0; + size_t setsize = CPU_ALLOC_SIZE(maxcpus); + struct libscols_table *tb; + + scols_init_debug(0); + + tb = scols_new_table(); + if (!tb) + err(EXIT_FAILURE, _("failed to allocate output table")); + + scols_table_enable_noheadings(tb, 1); + if (mod->json) { + scols_table_enable_json(tb, 1); + scols_table_set_name(tb, "lscpu"); + } + + if (scols_table_new_column(tb, "field", 0, 0) == NULL || + scols_table_new_column(tb, "data", 0, SCOLS_FL_NOEXTREMES) == NULL) + err(EXIT_FAILURE, _("failed to initialize output column")); + + add_summary_s(tb, _("Architecture:"), desc->arch); + if (desc->mode) { + char *p = buf; + + if (desc->mode & MODE_32BIT) { + strcpy(p, "32-bit, "); + p += 8; + } + if (desc->mode & MODE_64BIT) { + strcpy(p, "64-bit, "); + p += 8; + } + *(p - 2) = '\0'; + add_summary_s(tb, _("CPU op-mode(s):"), buf); + } +#if !defined(WORDS_BIGENDIAN) + add_summary_s(tb, _("Byte Order:"), "Little Endian"); +#else + add_summary_s(tb, _("Byte Order:"), "Big Endian"); +#endif + + if (desc->addrsz) + add_summary_s(tb, _("Address sizes:"), desc->addrsz); + + add_summary_n(tb, _("CPU(s):"), desc->ncpus); + + if (desc->online) + print_cpuset(tb, mod->hex ? _("On-line CPU(s) mask:") : + _("On-line CPU(s) list:"), + desc->online, mod->hex); + + if (desc->online && CPU_COUNT_S(setsize, desc->online) != desc->ncpus) { + cpu_set_t *set; + + /* Linux kernel provides cpuset of off-line CPUs that contains + * all configured CPUs (see /sys/devices/system/cpu/offline), + * but want to print real (present in system) off-line CPUs only. + */ + set = cpuset_alloc(maxcpus, NULL, NULL); + if (!set) + err(EXIT_FAILURE, _("failed to callocate cpu set")); + CPU_ZERO_S(setsize, set); + for (i = 0; i < desc->ncpuspos; i++) { + int cpu = real_cpu_num(desc, i); + if (!is_cpu_online(desc, cpu) && is_cpu_present(desc, cpu)) + CPU_SET_S(cpu, setsize, set); + } + print_cpuset(tb, mod->hex ? _("Off-line CPU(s) mask:") : + _("Off-line CPU(s) list:"), + set, mod->hex); + cpuset_free(set); + } + + if (desc->nsockets) { + int threads_per_core, cores_per_socket, sockets_per_book; + int books_per_drawer, drawers; + FILE *fd; + + threads_per_core = cores_per_socket = sockets_per_book = 0; + books_per_drawer = drawers = 0; + /* s390 detects its cpu topology via /proc/sysinfo, if present. + * Using simply the cpu topology masks in sysfs will not give + * usable results since everything is virtualized. E.g. + * virtual core 0 may have only 1 cpu, but virtual core 2 may + * five cpus. + * If the cpu topology is not exported (e.g. 2nd level guest) + * fall back to old calculation scheme. + */ + if ((fd = ul_path_fopen(desc->procfs, "r", "sysinfo"))) { + int t0, t1; + + while (fd && fgets(buf, sizeof(buf), fd) != NULL) { + if (sscanf(buf, "CPU Topology SW:%d%d%d%d%d%d", + &t0, &t1, &drawers, &books_per_drawer, + &sockets_per_book, + &cores_per_socket) == 6) + break; + } + if (fd) + fclose(fd); + } + if (desc->mtid) + threads_per_core = atoi(desc->mtid) + 1; + add_summary_n(tb, _("Thread(s) per core:"), + threads_per_core ?: desc->nthreads / desc->ncores); + add_summary_n(tb, _("Core(s) per socket:"), + cores_per_socket ?: desc->ncores / desc->nsockets); + if (desc->nbooks) { + add_summary_n(tb, _("Socket(s) per book:"), + sockets_per_book ?: desc->nsockets / desc->nbooks); + if (desc->ndrawers) { + add_summary_n(tb, _("Book(s) per drawer:"), + books_per_drawer ?: desc->nbooks / desc->ndrawers); + add_summary_n(tb, _("Drawer(s):"), drawers ?: desc->ndrawers); + } else { + add_summary_n(tb, _("Book(s):"), books_per_drawer ?: desc->nbooks); + } + } else { + add_summary_n(tb, _("Socket(s):"), sockets_per_book ?: desc->nsockets); + } + } + if (desc->nnodes) + add_summary_n(tb, _("NUMA node(s):"), desc->nnodes); + if (desc->vendor) + add_summary_s(tb, _("Vendor ID:"), desc->vendor); + if (desc->machinetype) + add_summary_s(tb, _("Machine type:"), desc->machinetype); + if (desc->family) + add_summary_s(tb, _("CPU family:"), desc->family); + if (desc->model || desc->revision) + add_summary_s(tb, _("Model:"), desc->revision ? desc->revision : desc->model); + if (desc->modelname || desc->cpu) + add_summary_s(tb, _("Model name:"), desc->cpu ? desc->cpu : desc->modelname); + if (desc->stepping) + add_summary_s(tb, _("Stepping:"), desc->stepping); + if (desc->mhz) + add_summary_s(tb, _("CPU MHz:"), desc->mhz); + if (desc->dynamic_mhz) + add_summary_s(tb, _("CPU dynamic MHz:"), desc->dynamic_mhz); + if (desc->static_mhz) + add_summary_s(tb, _("CPU static MHz:"), desc->static_mhz); + if (desc->maxmhz) + add_summary_s(tb, _("CPU max MHz:"), cpu_max_mhz(desc, buf, sizeof(buf))); + if (desc->minmhz) + add_summary_s(tb, _("CPU min MHz:"), cpu_min_mhz(desc, buf, sizeof(buf))); + if (desc->bogomips) + add_summary_s(tb, _("BogoMIPS:"), desc->bogomips); + if (desc->virtflag) { + if (!strcmp(desc->virtflag, "svm")) + add_summary_s(tb, _("Virtualization:"), "AMD-V"); + else if (!strcmp(desc->virtflag, "vmx")) + add_summary_s(tb, _("Virtualization:"), "VT-x"); + } + if (desc->hypervisor) + add_summary_s(tb, _("Hypervisor:"), desc->hypervisor); + if (desc->hyper) { + add_summary_s(tb, _("Hypervisor vendor:"), hv_vendors[desc->hyper]); + add_summary_s(tb, _("Virtualization type:"), _(virt_types[desc->virtype])); + } + if (desc->dispatching >= 0) + add_summary_s(tb, _("Dispatching mode:"), _(disp_modes[desc->dispatching])); + if (desc->ncaches) { + for (i = desc->ncaches - 1; i >= 0; i--) { + snprintf(buf, sizeof(buf), + _("%s cache:"), desc->caches[i].name); + add_summary_s(tb, buf, desc->caches[i].size); + } + } + if (desc->necaches) { + for (i = desc->necaches - 1; i >= 0; i--) { + snprintf(buf, sizeof(buf), + _("%s cache:"), desc->ecaches[i].name); + add_summary_s(tb, buf, desc->ecaches[i].size); + } + } + + for (i = 0; i < desc->nnodes; i++) { + snprintf(buf, sizeof(buf), _("NUMA node%d CPU(s):"), desc->idx2nodenum[i]); + print_cpuset(tb, buf, desc->nodemaps[i], mod->hex); + } + + if (desc->physsockets) { + add_summary_n(tb, _("Physical sockets:"), desc->physsockets); + add_summary_n(tb, _("Physical chips:"), desc->physchips); + add_summary_n(tb, _("Physical cores/chip:"), desc->physcoresperchip); + } + + if (desc->flags) + add_summary_s(tb, _("Flags:"), desc->flags); + + scols_print_table(tb); + scols_unref_table(tb); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Display information about the CPU architecture.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --all print both online and offline CPUs (default for -e)\n"), out); + fputs(_(" -b, --online print online CPUs only (default for -p)\n"), out); + fputs(_(" -c, --offline print offline CPUs only\n"), out); + fputs(_(" -J, --json use JSON for default or extended format\n"), out); + fputs(_(" -e, --extended[=<list>] print out an extended readable format\n"), out); + fputs(_(" -p, --parse[=<list>] print out a parsable format\n"), out); + fputs(_(" -s, --sysroot <dir> use specified directory as system root\n"), out); + fputs(_(" -x, --hex print hexadecimal masks rather than lists of CPUs\n"), out); + fputs(_(" -y, --physical print physical instead of logical IDs\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(25)); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(coldescs); i++) + fprintf(out, " %13s %s\n", coldescs[i].name, _(coldescs[i].help)); + + printf(USAGE_MAN_TAIL("lscpu(1)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + struct lscpu_modifier _mod = { .mode = OUTPUT_SUMMARY }, *mod = &_mod; + struct lscpu_desc _desc = { .flags = NULL }, *desc = &_desc; + int c, i; + int columns[ARRAY_SIZE(coldescs)], ncolumns = 0; + int cpu_modifier_specified = 0; + size_t setsize; + + enum { + OPT_OUTPUT_ALL = CHAR_MAX + 1, + }; + static const struct option longopts[] = { + { "all", no_argument, NULL, 'a' }, + { "online", no_argument, NULL, 'b' }, + { "offline", no_argument, NULL, 'c' }, + { "help", no_argument, NULL, 'h' }, + { "extended", optional_argument, NULL, 'e' }, + { "json", no_argument, NULL, 'J' }, + { "parse", optional_argument, NULL, 'p' }, + { "sysroot", required_argument, NULL, 's' }, + { "physical", no_argument, NULL, 'y' }, + { "hex", no_argument, NULL, 'x' }, + { "version", no_argument, NULL, 'V' }, + { "output-all", no_argument, NULL, OPT_OUTPUT_ALL }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'a','b','c' }, + { 'e','p' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "abce::hJp::s:xyV", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'a': + mod->online = mod->offline = 1; + cpu_modifier_specified = 1; + break; + case 'b': + mod->online = 1; + cpu_modifier_specified = 1; + break; + case 'c': + mod->offline = 1; + cpu_modifier_specified = 1; + break; + case 'h': + usage(); + case 'J': + mod->json = 1; + break; + case 'p': + case 'e': + if (optarg) { + if (*optarg == '=') + optarg++; + ncolumns = string_to_idarray(optarg, + columns, ARRAY_SIZE(columns), + column_name_to_id); + if (ncolumns < 0) + return EXIT_FAILURE; + } + mod->mode = c == 'p' ? OUTPUT_PARSABLE : OUTPUT_READABLE; + break; + case 's': + desc->prefix = optarg; + mod->system = SYSTEM_SNAPSHOT; + break; + case 'x': + mod->hex = 1; + break; + case 'y': + mod->physical = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case OPT_OUTPUT_ALL: + { + size_t sz; + for (sz = 0; sz < ARRAY_SIZE(coldescs); sz++) + columns[sz] = 1; + break; + } + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (cpu_modifier_specified && mod->mode == OUTPUT_SUMMARY) { + fprintf(stderr, + _("%s: options --all, --online and --offline may only " + "be used with options --extended or --parse.\n"), + program_invocation_short_name); + return EXIT_FAILURE; + } + + if (argc != optind) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + /* set default cpu display mode if none was specified */ + if (!mod->online && !mod->offline) { + mod->online = 1; + mod->offline = mod->mode == OUTPUT_READABLE ? 1 : 0; + } + + ul_path_init_debug(); + + /* /sys/devices/system/cpu */ + desc->syscpu = ul_new_path(_PATH_SYS_CPU); + if (!desc->syscpu) + err(EXIT_FAILURE, _("failed to initialize CPUs sysfs handler")); + if (desc->prefix) + ul_path_set_prefix(desc->syscpu, desc->prefix); + + /* /proc */ + desc->procfs = ul_new_path("/proc"); + if (!desc->procfs) + err(EXIT_FAILURE, _("failed to initialize procfs handler")); + if (desc->prefix) + ul_path_set_prefix(desc->procfs, desc->prefix); + + read_basicinfo(desc, mod); + + setsize = CPU_ALLOC_SIZE(maxcpus); + + for (i = 0; i < desc->ncpuspos; i++) { + /* only consider present CPUs */ + if (desc->present && + !CPU_ISSET_S(real_cpu_num(desc, i), setsize, desc->present)) + continue; + read_topology(desc, i); + read_cache(desc, i); + read_polarization(desc, i); + read_address(desc, i); + read_configured(desc, i); + read_max_mhz(desc, i); + read_min_mhz(desc, i); + } + + if (desc->caches) + qsort(desc->caches, desc->ncaches, + sizeof(struct cpu_cache), cachecmp); + + if (desc->ecaches) + qsort(desc->ecaches, desc->necaches, + sizeof(struct cpu_cache), cachecmp); + + read_nodes(desc); + read_hypervisor(desc, mod); + arm_cpu_decode(desc); + + switch(mod->mode) { + case OUTPUT_SUMMARY: + print_summary(desc, mod); + break; + case OUTPUT_PARSABLE: + if (!ncolumns) { + columns[ncolumns++] = COL_CPU; + columns[ncolumns++] = COL_CORE; + columns[ncolumns++] = COL_SOCKET; + columns[ncolumns++] = COL_NODE; + columns[ncolumns++] = COL_CACHE; + mod->compat = 1; + } + print_parsable(desc, columns, ncolumns, mod); + break; + case OUTPUT_READABLE: + if (!ncolumns) { + /* No list was given. Just print whatever is there. */ + columns[ncolumns++] = COL_CPU; + if (desc->nodemaps) + columns[ncolumns++] = COL_NODE; + if (desc->drawermaps) + columns[ncolumns++] = COL_DRAWER; + if (desc->bookmaps) + columns[ncolumns++] = COL_BOOK; + if (desc->socketmaps) + columns[ncolumns++] = COL_SOCKET; + if (desc->coremaps) + columns[ncolumns++] = COL_CORE; + if (desc->caches) + columns[ncolumns++] = COL_CACHE; + if (desc->online) + columns[ncolumns++] = COL_ONLINE; + if (desc->configured) + columns[ncolumns++] = COL_CONFIGURED; + if (desc->polarization) + columns[ncolumns++] = COL_POLARIZATION; + if (desc->addresses) + columns[ncolumns++] = COL_ADDRESS; + if (desc->maxmhz) + columns[ncolumns++] = COL_MAXMHZ; + if (desc->minmhz) + columns[ncolumns++] = COL_MINMHZ; + } + print_readable(desc, columns, ncolumns, mod); + break; + } + + ul_unref_path(desc->syscpu); + ul_unref_path(desc->procfs); + return EXIT_SUCCESS; +} diff --git a/sys-utils/lscpu.h b/sys-utils/lscpu.h new file mode 100644 index 0000000..24bc11e --- /dev/null +++ b/sys-utils/lscpu.h @@ -0,0 +1,194 @@ +#ifndef LSCPU_H +#define LSCPU_H + +#include "c.h" +#include "nls.h" +#include "cpuset.h" +#include "xalloc.h" +#include "strutils.h" +#include "bitops.h" +#include "path.h" +#include "pathnames.h" +#include "all-io.h" + +/* virtualization types */ +enum { + VIRT_NONE = 0, + VIRT_PARA, + VIRT_FULL, + VIRT_CONT +}; + +/* hypervisor vendors */ +enum { + HYPER_NONE = 0, + HYPER_XEN, + HYPER_KVM, + HYPER_MSHV, + HYPER_VMWARE, + HYPER_IBM, /* sys-z powervm */ + HYPER_VSERVER, + HYPER_UML, + HYPER_INNOTEK, /* VBOX */ + HYPER_HITACHI, + HYPER_PARALLELS, /* OpenVZ/VIrtuozzo */ + HYPER_VBOX, + HYPER_OS400, + HYPER_PHYP, + HYPER_SPAR, + HYPER_WSL, +}; + +/* CPU modes */ +enum { + MODE_32BIT = (1 << 1), + MODE_64BIT = (1 << 2) +}; + +/* cache(s) description */ +struct cpu_cache { + char *name; + char *size; + + int nsharedmaps; + cpu_set_t **sharedmaps; +}; + +/* dispatching modes */ +enum { + DISP_HORIZONTAL = 0, + DISP_VERTICAL = 1 +}; + +/* cpu polarization */ +enum { + POLAR_UNKNOWN = 0, + POLAR_VLOW, + POLAR_VMEDIUM, + POLAR_VHIGH, + POLAR_HORIZONTAL +}; + +struct polarization_modes { + char *parsable; + char *readable; +}; + + +/* global description */ +struct lscpu_desc { + const char *prefix; /* path to /sys and /proc snapshot or NULL */ + + struct path_cxt *syscpu; /* _PATH_SYS_CPU path handler */ + struct path_cxt *procfs; /* /proc path handler */ + + char *arch; + char *vendor; + char *machinetype; /* s390 */ + char *family; + char *model; + char *modelname; + char *revision; /* alternative for model (ppc) */ + char *cpu; /* alternative for modelname (ppc, sparc) */ + char *virtflag; /* virtualization flag (vmx, svm) */ + char *hypervisor; /* hypervisor software */ + int hyper; /* hypervisor vendor ID */ + int virtype; /* VIRT_PARA|FULL|NONE ? */ + char *mhz; + char *dynamic_mhz; /* dynamic mega hertz (s390) */ + char *static_mhz; /* static mega hertz (s390) */ + char **maxmhz; /* maximum mega hertz */ + char **minmhz; /* minimum mega hertz */ + char *stepping; + char *bogomips; + char *flags; + char *mtid; /* maximum thread id (s390) */ + char *addrsz; /* address sizes */ + int dispatching; /* none, horizontal or vertical */ + int mode; /* rm, lm or/and tm */ + + int ncpuspos; /* maximal possible CPUs */ + int ncpus; /* number of present CPUs */ + cpu_set_t *present; /* mask with present CPUs */ + cpu_set_t *online; /* mask with online CPUs */ + + int nthreads; /* number of online threads */ + + int ncaches; + struct cpu_cache *caches; + + int necaches; /* extra caches (s390) */ + struct cpu_cache *ecaches; + + /* + * All maps are sequentially indexed (0..ncpuspos), the array index + * does not have match with cpuX number as presented by kernel. You + * have to use real_cpu_num() to get the real cpuX number. + * + * For example, the possible system CPUs are: 1,3,5, it means that + * ncpuspos=3, so all arrays are in range 0..3. + */ + int *idx2cpunum; /* mapping index to CPU num */ + + int nnodes; /* number of NUMA modes */ + int *idx2nodenum; /* Support for discontinuous nodes */ + cpu_set_t **nodemaps; /* array with NUMA nodes */ + + /* drawers -- based on drawer_siblings (internal kernel map of cpuX's + * hardware threads within the same drawer */ + int ndrawers; /* number of all online drawers */ + cpu_set_t **drawermaps; /* unique drawer_siblings */ + int *drawerids; /* physical drawer ids */ + + /* books -- based on book_siblings (internal kernel map of cpuX's + * hardware threads within the same book */ + int nbooks; /* number of all online books */ + cpu_set_t **bookmaps; /* unique book_siblings */ + int *bookids; /* physical book ids */ + + /* sockets -- based on core_siblings (internal kernel map of cpuX's + * hardware threads within the same physical_package_id (socket)) */ + int nsockets; /* number of all online sockets */ + cpu_set_t **socketmaps; /* unique core_siblings */ + int *socketids; /* physical socket ids */ + + /* cores -- based on thread_siblings (internal kernel map of cpuX's + * hardware threads within the same core as cpuX) */ + int ncores; /* number of all online cores */ + cpu_set_t **coremaps; /* unique thread_siblings */ + int *coreids; /* physical core ids */ + + int *polarization; /* cpu polarization */ + int *addresses; /* physical cpu addresses */ + int *configured; /* cpu configured */ + int physsockets; /* Physical sockets (modules) */ + int physchips; /* Physical chips */ + int physcoresperchip; /* Physical cores per chip */ +}; + +enum { + OUTPUT_SUMMARY = 0, /* default */ + OUTPUT_PARSABLE, /* -p */ + OUTPUT_READABLE, /* -e */ +}; + +enum { + SYSTEM_LIVE = 0, /* analyzing a live system */ + SYSTEM_SNAPSHOT, /* analyzing a snapshot of a different system */ +}; + +struct lscpu_modifier { + int mode; /* OUTPUT_* */ + int system; /* SYSTEM_* */ + unsigned int hex:1, /* print CPU masks rather than CPU lists */ + compat:1, /* use backwardly compatible format */ + online:1, /* print online CPUs */ + offline:1, /* print offline CPUs */ + json:1, /* JSON output format */ + physical:1; /* use physical numbers */ +}; + +extern int read_hypervisor_dmi(void); +extern void arm_cpu_decode(struct lscpu_desc *desc); + +#endif /* LSCPU_H */ diff --git a/sys-utils/lsipc.1 b/sys-utils/lsipc.1 new file mode 100644 index 0000000..9bb1dce --- /dev/null +++ b/sys-utils/lsipc.1 @@ -0,0 +1,139 @@ +.\" Copyright 2015 Ondrej Oprala(ooprala@redhat.com) +.\" May be distributed under the GNU General Public License +.TH LSIPC "1" "November 2015" "util-linux" "User Commands" +.SH NAME +lsipc \- show information on IPC facilities currently employed in the system +.SH SYNOPSIS +.B lsipc +[options] +.SH DESCRIPTION +.B lsipc +shows information on the inter-process communication facilities +for which the calling process has read access. +.SH OPTIONS +.TP +\fB\-i\fR, \fB\-\-id\fR \fIid\fR +Show full details on just the one resource element identified by +.IR id . +This option needs to be combined with one of the three resource options: +.BR \-m , +.BR \-q " or" +.BR \-s . +It is possible to override the default output format for this option with the +\fB\-\-list\fR, \fB\-\-raw\fR, \fB\-\-json\fR or \fB\-\-export\fR option. +.TP +\fB\-g\fR, \fB\-\-global\fR +Show system-wide usage and limits of IPC resources. +This option may be combined with one of the three resource options: +.BR \-m , +.BR \-q " or" +.BR \-s . +The default is to show information about all resources. +.TP +\fB\-h\fR, \fB\-\-help\fR +Display help text and exit. +.TP +\fB\-V\fR, \fB\-\-version\fR +Display version information and exit. +.SS "Resource options" +.TP +\fB\-m\fR, \fB\-\-shmems\fR +Write information about active shared memory segments. +.TP +\fB\-q\fR, \fB\-\-queues\fR +Write information about active message queues. +.TP +\fB\-s\fR, \fB\-\-semaphores\fR +Write information about active semaphore sets. +.SS "Output formatting" +.TP +\fB\-c\fR, \fB\-\-creator\fR +Show creator and owner. +.TP +\fB\-e\fR, \fB\-\-export\fR +Output data in the format of NAME=VALUE. +.TP +\fB\-J\fR, \fB\-\-json\fR +Use the JSON output format. +.TP +\fB\-l\fR, \fB\-\-list\fR +Use the list output format. This is the default, except when \fB\-\-id\fR +is used. +.TP +\fB\-n\fR, \fB\-\-newline\fR +Display each piece of information on a separate line. +.TP +\fB\-\-noheadings\fR +Do not print a header line. +.TP +\fB\-\-notruncate\fR +Don't truncate output. +.TP +\fB\-o\fR, \fB\-\-output \fIlist\fP +Specify which output columns to print. Use +.B \-\-help +to get a list of all supported columns. +.TP +\fB\-b\fR, \fB\-\-bytes\fR +Print size in bytes rather than in human readable format. +.TP +\fB\-r\fR, \fB\-\-raw\fR +Raw output (no columnation). +.TP +\fB\-t\fR, \fB\-\-time\fR +Write time information. The time of the last control operation that changed +the access permissions for all facilities, the time of the last +.BR msgsnd (2) +and +.BR msgrcv (2) +operations on message queues, the time of the last +.BR shmat (2) +and +.BR shmdt (2) +operations on shared memory, and the time of the last +.BR semop (2) +operation on semaphores. +.TP +\fB\-\-time\-format\fR \fItype\fP +Display dates in short, full or iso format. The default is short, this time +format is designed to be space efficient and human readable. +.TP +\fB\-P\fR, \fB\-\-numeric\-perms\fR +Print numeric permissions in PERMS column. + +.SH EXIT STATUS +.TP +0 +if OK, +.TP +1 +if incorrect arguments specified, +.TP +2 +if a serious error occurs. +.SH SEE ALSO +.BR ipcmk (1), +.BR ipcrm (1), +.BR msgrcv (2), +.BR msgsnd (2), +.BR semget (2), +.BR semop (2), +.BR shmat (2), +.BR shmdt (2), +.BR shmget (2) +.SH HISTORY +The \fBlsipc\fP utility is inspired by the \fBipcs\fP utility. +.SH AUTHORS +.MT ooprala@redhat.com +Ondrej Oprala +.ME +.br +.MT kzak@redhat.com +Karel Zak +.ME + +.SH AVAILABILITY +The lsipc command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/lsipc.c b/sys-utils/lsipc.c new file mode 100644 index 0000000..e8ada57 --- /dev/null +++ b/sys-utils/lsipc.c @@ -0,0 +1,1338 @@ +/* + * lsipc - List information about IPC instances employed in the system + * + * Copyright (C) 2015 Ondrej Oprala <ooprala@redhat.com> + * Copyright (C) 2015 Karel Zak <ooprala@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * + * lsipc is inspired by the ipcs utility. The aim is to create + * a utility unencumbered by a standard to provide more flexible + * means of controlling the output. + */ + +#include <errno.h> +#include <getopt.h> +#include <sys/time.h> +#include <unistd.h> + +#include <libsmartcols.h> + +#include "c.h" +#include "nls.h" +#include "closestream.h" +#include "strutils.h" +#include "optutils.h" +#include "xalloc.h" +#include "procutils.h" +#include "ipcutils.h" +#include "timeutils.h" + +/* + * time modes + * */ +enum { + TIME_INVALID = 0, + TIME_SHORT, + TIME_FULL, + TIME_ISO +}; + +/* + * IDs + */ +enum { + /* generic */ + COLDESC_IDX_GEN_FIRST = 0, + COL_KEY = COLDESC_IDX_GEN_FIRST, + COL_ID, + COL_OWNER, + COL_PERMS, + COL_CUID, + COL_CUSER, + COL_CGID, + COL_CGROUP, + COL_UID, + COL_USER, + COL_GID, + COL_GROUP, + COL_CTIME, + COLDESC_IDX_GEN_LAST = COL_CTIME, + + /* msgq-specific */ + COLDESC_IDX_MSG_FIRST, + COL_USEDBYTES = COLDESC_IDX_MSG_FIRST, + COL_MSGS, + COL_SEND, + COL_RECV, + COL_LSPID, + COL_LRPID, + COLDESC_IDX_MSG_LAST = COL_LRPID, + + /* shm-specific */ + COLDESC_IDX_SHM_FIRST, + COL_SIZE = COLDESC_IDX_SHM_FIRST, + COL_NATTCH, + COL_STATUS, + COL_ATTACH, + COL_DETACH, + COL_COMMAND, + COL_CPID, + COL_LPID, + COLDESC_IDX_SHM_LAST = COL_LPID, + + /* sem-specific */ + COLDESC_IDX_SEM_FIRST, + COL_NSEMS = COLDESC_IDX_SEM_FIRST, + COL_OTIME, + COLDESC_IDX_SEM_LAST = COL_OTIME, + + /* summary (--global) */ + COLDESC_IDX_SUM_FIRST, + COL_RESOURCE = COLDESC_IDX_SUM_FIRST, + COL_DESC, + COL_LIMIT, + COL_USED, + COL_USEPERC, + COLDESC_IDX_SUM_LAST = COL_USEPERC +}; + +/* not all columns apply to all options, so we specify a legal range for each */ +static size_t LOWER, UPPER; + +/* + * output modes + */ +enum { + OUT_EXPORT = 1, + OUT_NEWLINE, + OUT_RAW, + OUT_JSON, + OUT_PRETTY, + OUT_LIST +}; + +struct lsipc_control { + int outmode; + unsigned int noheadings : 1, /* don't print header line */ + notrunc : 1, /* don't truncate columns */ + bytes : 1, /* SIZE in bytes */ + numperms : 1, /* numeric permissions */ + time_mode : 2; +}; + +struct lsipc_coldesc { + const char *name; + const char *help; + const char *pretty_name; + + double whint; /* width hint */ + long flag; +}; + +static const struct lsipc_coldesc coldescs[] = +{ + /* common */ + [COL_KEY] = { "KEY", N_("Resource key"), N_("Key"), 1}, + [COL_ID] = { "ID", N_("Resource ID"), N_("ID"), 1}, + [COL_OWNER] = { "OWNER", N_("Owner's username or UID"), N_("Owner"), 1, SCOLS_FL_RIGHT}, + [COL_PERMS] = { "PERMS", N_("Permissions"), N_("Permissions"), 1, SCOLS_FL_RIGHT}, + [COL_CUID] = { "CUID", N_("Creator UID"), N_("Creator UID"), 1, SCOLS_FL_RIGHT}, + [COL_CUSER] = { "CUSER", N_("Creator user"), N_("Creator user"), 1 }, + [COL_CGID] = { "CGID", N_("Creator GID"), N_("Creator GID"), 1, SCOLS_FL_RIGHT}, + [COL_CGROUP] = { "CGROUP", N_("Creator group"), N_("Creator group"), 1 }, + [COL_UID] = { "UID", N_("User ID"), N_("UID"), 1, SCOLS_FL_RIGHT}, + [COL_USER] = { "USER", N_("User name"), N_("User name"), 1}, + [COL_GID] = { "GID", N_("Group ID"), N_("GID"), 1, SCOLS_FL_RIGHT}, + [COL_GROUP] = { "GROUP", N_("Group name"), N_("Group name"), 1}, + [COL_CTIME] = { "CTIME", N_("Time of the last change"), N_("Last change"), 1, SCOLS_FL_RIGHT}, + + /* msgq-specific */ + [COL_USEDBYTES] = { "USEDBYTES",N_("Bytes used"), N_("Bytes used"), 1, SCOLS_FL_RIGHT}, + [COL_MSGS] = { "MSGS", N_("Number of messages"), N_("Messages"), 1}, + [COL_SEND] = { "SEND", N_("Time of last msg sent"), N_("Msg sent"), 1, SCOLS_FL_RIGHT}, + [COL_RECV] = { "RECV", N_("Time of last msg received"), N_("Msg received"), 1, SCOLS_FL_RIGHT}, + [COL_LSPID] = { "LSPID", N_("PID of the last msg sender"), N_("Msg sender"), 1, SCOLS_FL_RIGHT}, + [COL_LRPID] = { "LRPID", N_("PID of the last msg receiver"), N_("Msg receiver"), 1, SCOLS_FL_RIGHT}, + + /* shm-specific */ + [COL_SIZE] = { "SIZE", N_("Segment size"), N_("Segment size"), 1, SCOLS_FL_RIGHT}, + [COL_NATTCH] = { "NATTCH", N_("Number of attached processes"), N_("Attached processes"), 1, SCOLS_FL_RIGHT}, + [COL_STATUS] = { "STATUS", N_("Status"), N_("Status"), 1, SCOLS_FL_NOEXTREMES}, + [COL_ATTACH] = { "ATTACH", N_("Attach time"), N_("Attach time"), 1, SCOLS_FL_RIGHT}, + [COL_DETACH] = { "DETACH", N_("Detach time"), N_("Detach time"), 1, SCOLS_FL_RIGHT}, + [COL_COMMAND] = { "COMMAND", N_("Creator command line"), N_("Creator command"), 0, SCOLS_FL_TRUNC}, + [COL_CPID] = { "CPID", N_("PID of the creator"), N_("Creator PID"), 1, SCOLS_FL_RIGHT}, + [COL_LPID] = { "LPID", N_("PID of last user"), N_("Last user PID"), 1, SCOLS_FL_RIGHT}, + + /* sem-specific */ + [COL_NSEMS] = { "NSEMS", N_("Number of semaphores"), N_("Semaphores"), 1, SCOLS_FL_RIGHT}, + [COL_OTIME] = { "OTIME", N_("Time of the last operation"), N_("Last operation"), 1, SCOLS_FL_RIGHT}, + + /* cols for summarized information */ + [COL_RESOURCE] = { "RESOURCE", N_("Resource name"), N_("Resource"), 1 }, + [COL_DESC] = { "DESCRIPTION",N_("Resource description"), N_("Description"), 1 }, + [COL_USED] = { "USED", N_("Currently used"), N_("Used"), 1, SCOLS_FL_RIGHT }, + [COL_USEPERC] = { "USE%", N_("Currently use percentage"), N_("Use"), 1, SCOLS_FL_RIGHT }, + [COL_LIMIT] = { "LIMIT", N_("System-wide limit"), N_("Limit"), 1, SCOLS_FL_RIGHT }, +}; + + +/* columns[] array specifies all currently wanted output column. The columns + * are defined by coldescs[] array and you can specify (on command line) each + * column twice. That's enough, dynamically allocated array of the columns is + * unnecessary overkill and over-engineering in this case */ +static int columns[ARRAY_SIZE(coldescs) * 2]; +static size_t ncolumns; + +static inline size_t err_columns_index(size_t arysz, size_t idx) +{ + if (idx >= arysz) + errx(EXIT_FAILURE, _("too many columns specified, " + "the limit is %zu columns"), + arysz - 1); + return idx; +} + +#define add_column(ary, n, id) \ + ((ary)[ err_columns_index(ARRAY_SIZE(ary), (n)) ] = (id)) + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(coldescs); i++) { + const char *cn = coldescs[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) { + if (i > COL_CTIME) { + if (i >= LOWER && i <= UPPER) + return i; + else { + warnx(_("column %s does not apply to the specified IPC"), name); + return -1; + } + } else + return i; + } + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static int get_column_id(int num) +{ + assert(num >= 0); + assert((size_t) num < ncolumns); + assert((size_t) columns[num] < ARRAY_SIZE(coldescs)); + return columns[num]; +} + +static const struct lsipc_coldesc *get_column_desc(int num) +{ + return &coldescs[ get_column_id(num) ]; +} + +static char *get_username(struct passwd **pw, uid_t id) +{ + if (!*pw || (*pw)->pw_uid != id) + *pw = getpwuid(id); + + return *pw ? xstrdup((*pw)->pw_name) : NULL; +} + +static char *get_groupname(struct group **gr, gid_t id) +{ + if (!*gr || (*gr)->gr_gid != id) + *gr = getgrgid(id); + + return *gr ? xstrdup((*gr)->gr_name) : NULL; +} + +static int parse_time_mode(const char *s) +{ + struct lsipc_timefmt { + const char *name; + const int val; + }; + static const struct lsipc_timefmt timefmts[] = { + {"iso", TIME_ISO}, + {"full", TIME_FULL}, + {"short", TIME_SHORT}, + }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(timefmts); i++) { + if (strcmp(timefmts[i].name, s) == 0) + return timefmts[i].val; + } + errx(EXIT_FAILURE, _("unknown time format: %s"), s); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Show information on IPC facilities.\n"), out); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Resource options:\n"), out); + fputs(_(" -m, --shmems shared memory segments\n"), out); + fputs(_(" -q, --queues message queues\n"), out); + fputs(_(" -s, --semaphores semaphores\n"), out); + fputs(_(" -g, --global info about system-wide usage (may be used with -m, -q and -s)\n"), out); + fputs(_(" -i, --id <id> print details on resource identified by <id>\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" --noheadings don't print headings\n"), out); + fputs(_(" --notruncate don't truncate output\n"), out); + fputs(_(" --time-format=<type> display dates in short, full or iso format\n"), out); + fputs(_(" -b, --bytes print SIZE in bytes rather than in human readable format\n"), out); + fputs(_(" -c, --creator show creator and owner\n"), out); + fputs(_(" -e, --export display in an export-able output format\n"), out); + fputs(_(" -J, --json use the JSON output format\n"), out); + fputs(_(" -n, --newline display each piece of information on a new line\n"), out); + fputs(_(" -l, --list force list output format (for example with --id)\n"), out); + fputs(_(" -o, --output[=<list>] define the columns to output\n"), out); + fputs(_(" -P, --numeric-perms print numeric permissions (PERMS column)\n"), out); + fputs(_(" -r, --raw display in raw mode\n"), out); + fputs(_(" -t, --time show attach, detach and change times\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(26)); + + fprintf(out, _("\nGeneric columns:\n")); + for (i = COLDESC_IDX_GEN_FIRST; i <= COLDESC_IDX_GEN_LAST; i++) + fprintf(out, " %14s %s\n", coldescs[i].name, _(coldescs[i].help)); + + fprintf(out, _("\nShared-memory columns (--shmems):\n")); + for (i = COLDESC_IDX_SHM_FIRST; i <= COLDESC_IDX_SHM_LAST; i++) + fprintf(out, " %14s %s\n", coldescs[i].name, _(coldescs[i].help)); + + fprintf(out, _("\nMessage-queue columns (--queues):\n")); + for (i = COLDESC_IDX_MSG_FIRST; i <= COLDESC_IDX_MSG_LAST; i++) + fprintf(out, " %14s %s\n", coldescs[i].name, _(coldescs[i].help)); + + fprintf(out, _("\nSemaphore columns (--semaphores):\n")); + for (i = COLDESC_IDX_SEM_FIRST; i <= COLDESC_IDX_SEM_LAST; i++) + fprintf(out, " %14s %s\n", coldescs[i].name, _(coldescs[i].help)); + + fprintf(out, _("\nSummary columns (--global):\n")); + for (i = COLDESC_IDX_SUM_FIRST; i <= COLDESC_IDX_SUM_LAST; i++) + fprintf(out, " %14s %s\n", coldescs[i].name, _(coldescs[i].help)); + + printf(USAGE_MAN_TAIL("lsipc(1)")); + exit(EXIT_SUCCESS); +} + +static struct libscols_table *new_table(struct lsipc_control *ctl) +{ + struct libscols_table *table = scols_new_table(); + + if (!table) + err(EXIT_FAILURE, _("failed to allocate output table")); + + if (ctl->noheadings) + scols_table_enable_noheadings(table, 1); + + switch(ctl->outmode) { + case OUT_NEWLINE: + scols_table_set_column_separator(table, "\n"); + /* fallthrough */ + case OUT_EXPORT: + scols_table_enable_export(table, 1); + break; + case OUT_RAW: + scols_table_enable_raw(table, 1); + break; + case OUT_PRETTY: + scols_table_enable_noheadings(table, 1); + break; + case OUT_JSON: + scols_table_enable_json(table, 1); + break; + default: + break; + } + return table; +} + +static struct libscols_table *setup_table(struct lsipc_control *ctl) +{ + struct libscols_table *table = new_table(ctl); + size_t n; + + for (n = 0; n < ncolumns; n++) { + const struct lsipc_coldesc *desc = get_column_desc(n); + int flags = desc->flag; + + if (ctl->notrunc) + flags &= ~SCOLS_FL_TRUNC; + if (!scols_table_new_column(table, desc->name, desc->whint, flags)) + goto fail; + } + return table; +fail: + scols_unref_table(table); + return NULL; +} + +static int print_pretty(struct libscols_table *table) +{ + struct libscols_iter *itr = scols_new_iter(SCOLS_ITER_FORWARD); + struct libscols_column *col; + struct libscols_cell *data; + struct libscols_line *ln; + const char *hstr, *dstr; + int n = 0; + + ln = scols_table_get_line(table, 0); + while (!scols_table_next_column(table, itr, &col)) { + + data = scols_line_get_cell(ln, n); + + hstr = N_(get_column_desc(n)->pretty_name); + dstr = scols_cell_get_data(data); + + if (dstr) + printf("%s:%*c%-36s\n", hstr, 35 - (int)strlen(hstr), ' ', dstr); + ++n; + } + + /* this is used to pretty-print detailed info about a semaphore array */ + if (ln) { + struct libscols_table *subtab = scols_line_get_userdata(ln); + if (subtab) { + printf(_("Elements:\n\n")); + scols_print_table(subtab); + } + } + + scols_free_iter(itr); + return 0; + +} + +static int print_table(struct lsipc_control *ctl, struct libscols_table *tb) +{ + if (ctl->outmode == OUT_PRETTY) + print_pretty(tb); + else + scols_print_table(tb); + return 0; +} +static struct timeval now; + +static char *make_time(int mode, time_t time) +{ + char buf[64] = {0}; + + switch(mode) { + case TIME_FULL: + { + struct tm tm; + char *s; + + localtime_r(&time, &tm); + asctime_r(&tm, buf); + if (*(s = buf + strlen(buf) - 1) == '\n') + *s = '\0'; + break; + } + case TIME_SHORT: + strtime_short(&time, &now, 0, buf, sizeof(buf)); + break; + case TIME_ISO: + strtime_iso(&time, ISO_TIMESTAMP_T, buf, sizeof(buf)); + break; + default: + errx(EXIT_FAILURE, _("unsupported time type")); + } + return xstrdup(buf); +} + +static void global_set_data(struct libscols_table *tb, const char *resource, + const char *desc, uintmax_t used, uintmax_t limit, int usage) +{ + struct libscols_line *ln; + size_t n; + + ln = scols_table_new_line(tb, NULL); + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + + for (n = 0; n < ncolumns; n++) { + int rc = 0; + char *arg = NULL; + + switch (get_column_id(n)) { + case COL_RESOURCE: + rc = scols_line_set_data(ln, n, resource); + break; + case COL_DESC: + rc = scols_line_set_data(ln, n, desc); + break; + case COL_USED: + if (usage) { + xasprintf(&arg, "%ju", used); + rc = scols_line_refer_data(ln, n, arg); + } else + rc = scols_line_set_data(ln, n, "-"); + break; + case COL_USEPERC: + if (usage) { + xasprintf(&arg, "%2.2f%%", (double) used / limit * 100); + rc = scols_line_refer_data(ln, n, arg); + } else + rc = scols_line_set_data(ln, n, "-"); + break; + case COL_LIMIT: + xasprintf(&arg, "%ju", limit); + rc = scols_line_refer_data(ln, n, arg); + break; + } + + if (rc != 0) + err(EXIT_FAILURE, _("failed to add output data")); + } +} + +static void setup_sem_elements_columns(struct libscols_table *tb) +{ + scols_table_set_name(tb, "elements"); + if (!scols_table_new_column(tb, "SEMNUM", 0, SCOLS_FL_RIGHT)) + err_oom(); + if (!scols_table_new_column(tb, "VALUE", 0, SCOLS_FL_RIGHT)) + err_oom(); + if (!scols_table_new_column(tb, "NCOUNT", 0, SCOLS_FL_RIGHT)) + err_oom(); + if (!scols_table_new_column(tb, "ZCOUNT", 0, SCOLS_FL_RIGHT)) + err_oom(); + if (!scols_table_new_column(tb, "PID", 0, SCOLS_FL_RIGHT)) + err_oom(); + if (!scols_table_new_column(tb, "COMMAND", 0, SCOLS_FL_RIGHT)) + err_oom(); +} + +static void do_sem(int id, struct lsipc_control *ctl, struct libscols_table *tb) +{ + struct libscols_line *ln; + struct passwd *pw = NULL, *cpw = NULL; + struct group *gr = NULL, *cgr = NULL; + struct sem_data *semds, *semdsp; + char *arg = NULL; + + scols_table_set_name(tb, "semaphores"); + + if (ipc_sem_get_info(id, &semds) < 1) { + if (id > -1) + warnx(_("id %d not found"), id); + return; + } + for (semdsp = semds; semdsp->next != NULL || id > -1; semdsp = semdsp->next) { + size_t n; + + ln = scols_table_new_line(tb, NULL); + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + + for (n = 0; n < ncolumns; n++) { + int rc = 0; + switch (get_column_id(n)) { + case COL_KEY: + xasprintf(&arg, "0x%08x",semdsp->sem_perm.key); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_ID: + xasprintf(&arg, "%d",semdsp->sem_perm.id); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_OWNER: + arg = get_username(&pw, semdsp->sem_perm.uid); + if (!arg) + xasprintf(&arg, "%u", semdsp->sem_perm.uid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_PERMS: + if (ctl->numperms) + xasprintf(&arg, "%#o", semdsp->sem_perm.mode & 0777); + else { + arg = xmalloc(11); + xstrmode(semdsp->sem_perm.mode & 0777, arg); + } + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CUID: + xasprintf(&arg, "%u", semdsp->sem_perm.cuid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CUSER: + arg = get_username(&cpw, semdsp->sem_perm.cuid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CGID: + xasprintf(&arg, "%u", semdsp->sem_perm.cgid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CGROUP: + arg = get_groupname(&cgr, semdsp->sem_perm.cgid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_UID: + xasprintf(&arg, "%u", semdsp->sem_perm.uid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_USER: + arg = get_username(&pw, semdsp->sem_perm.uid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_GID: + xasprintf(&arg, "%u", semdsp->sem_perm.gid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_GROUP: + arg = get_groupname(&gr, semdsp->sem_perm.gid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CTIME: + if (semdsp->sem_ctime != 0) { + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)semdsp->sem_ctime)); + } + break; + case COL_NSEMS: + xasprintf(&arg, "%ju", semdsp->sem_nsems); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_OTIME: + if (semdsp->sem_otime != 0) { + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)semdsp->sem_otime)); + } + break; + } + if (rc != 0) + err(EXIT_FAILURE, _("failed to add output data")); + arg = NULL; + } + + if (id > -1 && semds->sem_nsems) { + /* Create extra table with ID specific semaphore elements */ + struct libscols_table *sub = new_table(ctl); + size_t i; + int rc = 0; + + scols_table_enable_noheadings(sub, 0); + setup_sem_elements_columns(sub); + + for (i = 0; i < semds->sem_nsems; i++) { + struct sem_elem *e = &semds->elements[i]; + struct libscols_line *sln = scols_table_new_line(sub, NULL); + + if (!sln) + err(EXIT_FAILURE, _("failed to allocate output line")); + + /* SEMNUM */ + xasprintf(&arg, "%zu", i); + rc = scols_line_refer_data(sln, 0, arg); + if (rc) + break; + + /* VALUE */ + xasprintf(&arg, "%d", e->semval); + rc = scols_line_refer_data(sln, 1, arg); + if (rc) + break; + + /* NCOUNT */ + xasprintf(&arg, "%d", e->ncount); + rc = scols_line_refer_data(sln, 2, arg); + if (rc) + break; + + /* ZCOUNT */ + xasprintf(&arg, "%d", e->zcount); + rc = scols_line_refer_data(sln, 3, arg); + if (rc) + break; + + /* PID */ + xasprintf(&arg, "%d", e->pid); + rc = scols_line_refer_data(sln, 4, arg); + if (rc) + break; + + /* COMMAND */ + arg = proc_get_command(e->pid); + rc = scols_line_refer_data(sln, 5, arg); + if (rc) + break; + } + + if (rc != 0) + err(EXIT_FAILURE, _("failed to set data")); + + scols_line_set_userdata(ln, (void *)sub); + break; + } + } + ipc_sem_free_info(semds); +} + +static void do_sem_global(struct libscols_table *tb) +{ + struct sem_data *semds, *semdsp; + struct ipc_limits lim; + int nsems = 0, nsets = 0; + + ipc_sem_get_limits(&lim); + + if (ipc_sem_get_info(-1, &semds) > 0) { + for (semdsp = semds; semdsp->next != NULL; semdsp = semdsp->next) { + ++nsets; + nsems += semds->sem_nsems; + } + ipc_sem_free_info(semds); + } + + global_set_data(tb, "SEMMNI", _("Number of semaphore identifiers"), nsets, lim.semmni, 1); + global_set_data(tb, "SEMMNS", _("Total number of semaphores"), nsems, lim.semmns, 1); + global_set_data(tb, "SEMMSL", _("Max semaphores per semaphore set."), 0, lim.semmsl, 0); + global_set_data(tb, "SEMOPM", _("Max number of operations per semop(2)"), 0, lim.semopm, 0); + global_set_data(tb, "SEMVMX", _("Semaphore max value"), 0, lim.semvmx, 0); +} + +static void do_msg(int id, struct lsipc_control *ctl, struct libscols_table *tb) +{ + struct libscols_line *ln; + struct passwd *pw = NULL; + struct group *gr = NULL; + struct msg_data *msgds, *msgdsp; + char *arg = NULL; + + if (ipc_msg_get_info(id, &msgds) < 1) { + if (id > -1) + warnx(_("id %d not found"), id); + return; + } + scols_table_set_name(tb, "messages"); + + for (msgdsp = msgds; msgdsp->next != NULL || id > -1 ; msgdsp = msgdsp->next) { + size_t n; + ln = scols_table_new_line(tb, NULL); + + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + + /* no need to call getpwuid() for the same user */ + if (!(pw && pw->pw_uid == msgdsp->msg_perm.uid)) + pw = getpwuid(msgdsp->msg_perm.uid); + + /* no need to call getgrgid() for the same user */ + if (!(gr && gr->gr_gid == msgdsp->msg_perm.gid)) + gr = getgrgid(msgdsp->msg_perm.gid); + + for (n = 0; n < ncolumns; n++) { + int rc = 0; + + switch (get_column_id(n)) { + case COL_KEY: + xasprintf(&arg, "0x%08x",msgdsp->msg_perm.key); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_ID: + xasprintf(&arg, "%d",msgdsp->msg_perm.id); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_OWNER: + arg = get_username(&pw, msgdsp->msg_perm.uid); + if (!arg) + xasprintf(&arg, "%u", msgdsp->msg_perm.uid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_PERMS: + if (ctl->numperms) + xasprintf(&arg, "%#o", msgdsp->msg_perm.mode & 0777); + else { + arg = xmalloc(11); + xstrmode(msgdsp->msg_perm.mode & 0777, arg); + rc = scols_line_refer_data(ln, n, arg); + } + break; + case COL_CUID: + xasprintf(&arg, "%u", msgdsp->msg_perm.cuid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CUSER: + arg = get_username(&pw, msgdsp->msg_perm.cuid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CGID: + xasprintf(&arg, "%u", msgdsp->msg_perm.cuid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CGROUP: + arg = get_groupname(&gr, msgdsp->msg_perm.cgid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_UID: + xasprintf(&arg, "%u", msgdsp->msg_perm.uid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_USER: + arg = get_username(&pw, msgdsp->msg_perm.uid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_GID: + xasprintf(&arg, "%u", msgdsp->msg_perm.gid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_GROUP: + arg = get_groupname(&gr,msgdsp->msg_perm.gid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CTIME: + if (msgdsp->q_ctime != 0) + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)msgdsp->q_ctime)); + break; + case COL_USEDBYTES: + xasprintf(&arg, "%ju", msgdsp->q_cbytes); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_MSGS: + xasprintf(&arg, "%ju", msgdsp->q_qnum); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_SEND: + if (msgdsp->q_stime != 0) + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)msgdsp->q_stime)); + break; + case COL_RECV: + if (msgdsp->q_rtime != 0) + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)msgdsp->q_rtime)); + break; + case COL_LSPID: + xasprintf(&arg, "%u", msgdsp->q_lspid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_LRPID: + xasprintf(&arg, "%u", msgdsp->q_lrpid); + rc = scols_line_refer_data(ln, n, arg); + break; + } + if (rc != 0) + err(EXIT_FAILURE, _("failed to set data")); + arg = NULL; + } + if (id > -1) + break; + } + ipc_msg_free_info(msgds); +} + + +static void do_msg_global(struct libscols_table *tb) +{ + struct msg_data *msgds, *msgdsp; + struct ipc_limits lim; + int msgqs = 0; + + ipc_msg_get_limits(&lim); + + /* count number of used queues */ + if (ipc_msg_get_info(-1, &msgds) > 0) { + for (msgdsp = msgds; msgdsp->next != NULL; msgdsp = msgdsp->next) + ++msgqs; + ipc_msg_free_info(msgds); + } + + global_set_data(tb, "MSGMNI", _("Number of message queues"), msgqs, lim.msgmni, 1); + global_set_data(tb, "MSGMAX", _("Max size of message (bytes)"), 0, lim.msgmax, 0); + global_set_data(tb, "MSGMNB", _("Default max size of queue (bytes)"), 0, lim.msgmnb, 0); +} + + +static void do_shm(int id, struct lsipc_control *ctl, struct libscols_table *tb) +{ + struct libscols_line *ln; + struct passwd *pw = NULL; + struct group *gr = NULL; + struct shm_data *shmds, *shmdsp; + char *arg = NULL; + + if (ipc_shm_get_info(id, &shmds) < 1) { + if (id > -1) + warnx(_("id %d not found"), id); + return; + } + + scols_table_set_name(tb, "sharedmemory"); + + for (shmdsp = shmds; shmdsp->next != NULL || id > -1 ; shmdsp = shmdsp->next) { + size_t n; + ln = scols_table_new_line(tb, NULL); + + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + + for (n = 0; n < ncolumns; n++) { + int rc = 0; + + switch (get_column_id(n)) { + case COL_KEY: + xasprintf(&arg, "0x%08x",shmdsp->shm_perm.key); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_ID: + xasprintf(&arg, "%d",shmdsp->shm_perm.id); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_OWNER: + arg = get_username(&pw, shmdsp->shm_perm.uid); + if (!arg) + xasprintf(&arg, "%u", shmdsp->shm_perm.uid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_PERMS: + if (ctl->numperms) + xasprintf(&arg, "%#o", shmdsp->shm_perm.mode & 0777); + else { + arg = xmalloc(11); + xstrmode(shmdsp->shm_perm.mode & 0777, arg); + } + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CUID: + xasprintf(&arg, "%u", shmdsp->shm_perm.cuid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CUSER: + arg = get_username(&pw, shmdsp->shm_perm.cuid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CGID: + xasprintf(&arg, "%u", shmdsp->shm_perm.cuid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CGROUP: + arg = get_groupname(&gr, shmdsp->shm_perm.cgid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_UID: + xasprintf(&arg, "%u", shmdsp->shm_perm.uid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_USER: + arg = get_username(&pw, shmdsp->shm_perm.uid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_GID: + xasprintf(&arg, "%u", shmdsp->shm_perm.gid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_GROUP: + arg = get_groupname(&gr, shmdsp->shm_perm.gid); + if (arg) + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_CTIME: + if (shmdsp->shm_ctim != 0) + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)shmdsp->shm_ctim)); + break; + case COL_SIZE: + if (ctl->bytes) + xasprintf(&arg, "%ju", shmdsp->shm_segsz); + else + arg = size_to_human_string(SIZE_SUFFIX_1LETTER, shmdsp->shm_segsz); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_NATTCH: + xasprintf(&arg, "%ju", shmdsp->shm_nattch); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_STATUS: { + int comma = 0; + size_t offt = 0; + + free(arg); + arg = xcalloc(1, sizeof(char) * strlen(_("dest")) + + strlen(_("locked")) + + strlen(_("hugetlb")) + + strlen(_("noreserve")) + 4); +#ifdef SHM_DEST + if (shmdsp->shm_perm.mode & SHM_DEST) { + offt += sprintf(arg, "%s", _("dest")); + comma++; + } +#endif +#ifdef SHM_LOCKED + if (shmdsp->shm_perm.mode & SHM_LOCKED) { + if (comma) + arg[offt++] = ','; + offt += sprintf(arg + offt, "%s", _("locked")); + } +#endif +#ifdef SHM_HUGETLB + if (shmdsp->shm_perm.mode & SHM_HUGETLB) { + if (comma) + arg[offt++] = ','; + offt += sprintf(arg + offt, "%s", _("hugetlb")); + } +#endif +#ifdef SHM_NORESERVE + if (shmdsp->shm_perm.mode & SHM_NORESERVE) { + if (comma) + arg[offt++] = ','; + sprintf(arg + offt, "%s", _("noreserve")); + } +#endif + rc = scols_line_refer_data(ln, n, arg); + } + break; + case COL_ATTACH: + if (shmdsp->shm_atim != 0) + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)shmdsp->shm_atim)); + break; + case COL_DETACH: + if (shmdsp->shm_dtim != 0) + rc = scols_line_refer_data(ln, n, + make_time(ctl->time_mode, + (time_t)shmdsp->shm_dtim)); + break; + case COL_CPID: + xasprintf(&arg, "%u", shmdsp->shm_cprid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_LPID: + xasprintf(&arg, "%u", shmdsp->shm_lprid); + rc = scols_line_refer_data(ln, n, arg); + break; + case COL_COMMAND: + arg = proc_get_command(shmdsp->shm_cprid); + rc = scols_line_refer_data(ln, n, arg); + break; + } + if (rc != 0) + err(EXIT_FAILURE, _("failed to set data")); + arg = NULL; + } + if (id > -1) + break; + } + ipc_shm_free_info(shmds); +} + +static void do_shm_global(struct libscols_table *tb) +{ + struct shm_data *shmds, *shmdsp; + uint64_t nsegs = 0, sum_segsz = 0; + struct ipc_limits lim; + + ipc_shm_get_limits(&lim); + + if (ipc_shm_get_info(-1, &shmds) > 0) { + for (shmdsp = shmds; shmdsp->next != NULL; shmdsp = shmdsp->next) { + ++nsegs; + sum_segsz += shmdsp->shm_segsz; + } + ipc_shm_free_info(shmds); + } + + global_set_data(tb, "SHMMNI", _("Shared memory segments"), nsegs, lim.shmmni, 1); + global_set_data(tb, "SHMALL", _("Shared memory pages"), sum_segsz / getpagesize(), lim.shmall, 1); + global_set_data(tb, "SHMMAX", _("Max size of shared memory segment (bytes)"), 0, lim.shmmax, 0); + global_set_data(tb, "SHMMIN", _("Min size of shared memory segment (bytes)"), 0, lim.shmmin, 0); +} + +int main(int argc, char *argv[]) +{ + int opt, msg = 0, sem = 0, shm = 0, id = -1; + int show_time = 0, show_creat = 0, global = 0; + size_t i; + struct lsipc_control *ctl = xcalloc(1, sizeof(struct lsipc_control)); + static struct libscols_table *tb; + char *outarg = NULL; + + /* long only options. */ + enum { + OPT_NOTRUNC = CHAR_MAX + 1, + OPT_NOHEAD, + OPT_TIME_FMT + }; + + static const struct option longopts[] = { + { "bytes", no_argument, NULL, 'b' }, + { "creator", no_argument, NULL, 'c' }, + { "export", no_argument, NULL, 'e' }, + { "global", no_argument, NULL, 'g' }, + { "help", no_argument, NULL, 'h' }, + { "id", required_argument, NULL, 'i' }, + { "json", no_argument, NULL, 'J' }, + { "list", no_argument, NULL, 'l' }, + { "newline", no_argument, NULL, 'n' }, + { "noheadings", no_argument, NULL, OPT_NOHEAD }, + { "notruncate", no_argument, NULL, OPT_NOTRUNC }, + { "numeric-perms", no_argument, NULL, 'P' }, + { "output", required_argument, NULL, 'o' }, + { "queues", no_argument, NULL, 'q' }, + { "raw", no_argument, NULL, 'r' }, + { "semaphores", no_argument, NULL, 's' }, + { "shmems", no_argument, NULL, 'm' }, + { "time", no_argument, NULL, 't' }, + { "time-format", required_argument, NULL, OPT_TIME_FMT }, + { "version", no_argument, NULL, 'V' }, + {NULL, 0, NULL, 0} + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'J', 'e', 'l', 'n', 'r' }, + { 'g', 'i' }, + { 'c', 'o', 't' }, + { 'm', 'q', 's' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + ctl->time_mode = 0; + + scols_init_debug(0); + + while ((opt = getopt_long(argc, argv, "bceghi:Jlmno:PqrstV", longopts, NULL)) != -1) { + + err_exclusive_options(opt, longopts, excl, excl_st); + + switch (opt) { + case 'b': + ctl->bytes = 1; + break; + case 'i': + id = strtos32_or_err(optarg, _("failed to parse IPC identifier")); + break; + case 'e': + ctl->outmode = OUT_EXPORT; + break; + case 'r': + ctl->outmode = OUT_RAW; + break; + case 'o': + outarg = optarg; + break; + case 'g': + global = 1; + break; + case 'q': + msg = 1; + add_column(columns, ncolumns++, COL_KEY); + add_column(columns, ncolumns++, COL_ID); + add_column(columns, ncolumns++, COL_PERMS); + add_column(columns, ncolumns++, COL_OWNER); + add_column(columns, ncolumns++, COL_USEDBYTES); + add_column(columns, ncolumns++, COL_MSGS); + add_column(columns, ncolumns++, COL_LSPID); + add_column(columns, ncolumns++, COL_LRPID); + LOWER = COLDESC_IDX_MSG_FIRST; + UPPER = COLDESC_IDX_MSG_LAST; + break; + case 'l': + ctl->outmode = OUT_LIST; + break; + case 'm': + shm = 1; + add_column(columns, ncolumns++, COL_KEY); + add_column(columns, ncolumns++, COL_ID); + add_column(columns, ncolumns++, COL_PERMS); + add_column(columns, ncolumns++, COL_OWNER); + add_column(columns, ncolumns++, COL_SIZE); + add_column(columns, ncolumns++, COL_NATTCH); + add_column(columns, ncolumns++, COL_STATUS); + add_column(columns, ncolumns++, COL_CTIME); + add_column(columns, ncolumns++, COL_CPID); + add_column(columns, ncolumns++, COL_LPID); + add_column(columns, ncolumns++, COL_COMMAND); + LOWER = COLDESC_IDX_SHM_FIRST; + UPPER = COLDESC_IDX_SHM_LAST; + break; + case 'n': + ctl->outmode = OUT_NEWLINE; + break; + case 'P': + ctl->numperms = 1; + break; + case 's': + sem = 1; + add_column(columns, ncolumns++, COL_KEY); + add_column(columns, ncolumns++, COL_ID); + add_column(columns, ncolumns++, COL_PERMS); + add_column(columns, ncolumns++, COL_OWNER); + add_column(columns, ncolumns++, COL_NSEMS); + LOWER = COLDESC_IDX_SEM_FIRST; + UPPER = COLDESC_IDX_SEM_LAST; + break; + case OPT_NOTRUNC: + ctl->notrunc = 1; + break; + case OPT_NOHEAD: + ctl->noheadings = 1; + break; + case OPT_TIME_FMT: + ctl->time_mode = parse_time_mode(optarg); + break; + case 'J': + ctl->outmode = OUT_JSON; + break; + case 't': + show_time = 1; + break; + case 'c': + show_creat = 1; + break; + case 'h': + usage(); + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + /* default is global */ + if (msg + shm + sem == 0) { + msg = shm = sem = global = 1; + if (show_time || show_creat || id != -1) + errx(EXIT_FAILURE, _("--global is mutually exclusive with --creator, --id and --time")); + } + if (global) { + add_column(columns, ncolumns++, COL_RESOURCE); + add_column(columns, ncolumns++, COL_DESC); + add_column(columns, ncolumns++, COL_LIMIT); + add_column(columns, ncolumns++, COL_USED); + add_column(columns, ncolumns++, COL_USEPERC); + LOWER = COLDESC_IDX_SUM_FIRST; + UPPER = COLDESC_IDX_SUM_LAST; + } + + /* default to pretty-print if --id specified */ + if (id != -1 && !ctl->outmode) + ctl->outmode = OUT_PRETTY; + + if (!ctl->time_mode) + ctl->time_mode = ctl->outmode == OUT_PRETTY ? TIME_FULL : TIME_SHORT; + + if (ctl->outmode == OUT_PRETTY && !(optarg || show_creat || show_time)) { + /* all columns for lsipc --<RESOURCE> --id <ID> */ + for (ncolumns = 0, i = 0; i < ARRAY_SIZE(coldescs); i++) + columns[ncolumns++] = i; + } else { + if (show_creat) { + add_column(columns, ncolumns++, COL_CUID); + add_column(columns, ncolumns++, COL_CGID); + add_column(columns, ncolumns++, COL_UID); + add_column(columns, ncolumns++, COL_GID); + } + if (msg && show_time) { + add_column(columns, ncolumns++, COL_SEND); + add_column(columns, ncolumns++, COL_RECV); + add_column(columns, ncolumns++, COL_CTIME); + } + if (shm && show_time) { + /* keep "COMMAND" as last column */ + size_t cmd = columns[ncolumns - 1] == COL_COMMAND; + + if (cmd) + ncolumns--; + add_column(columns, ncolumns++, COL_ATTACH); + add_column(columns, ncolumns++, COL_DETACH); + if (cmd) + add_column(columns, ncolumns++, COL_COMMAND); + } + if (sem && show_time) { + add_column(columns, ncolumns++, COL_OTIME); + add_column(columns, ncolumns++, COL_CTIME); + } + } + + if (outarg && string_add_to_idarray(outarg, columns, ARRAY_SIZE(columns), + &ncolumns, column_name_to_id) < 0) + return EXIT_FAILURE; + + tb = setup_table(ctl); + if (!tb) + return EXIT_FAILURE; + + if (global) + scols_table_set_name(tb, "ipclimits"); + + if (msg) { + if (global) + do_msg_global(tb); + else + do_msg(id, ctl, tb); + } + if (shm) { + if (global) + do_shm_global(tb); + else + do_shm(id, ctl, tb); + } + if (sem) { + if (global) + do_sem_global(tb); + else + do_sem(id, ctl, tb); + } + + print_table(ctl, tb); + + scols_unref_table(tb); + free(ctl); + + return EXIT_SUCCESS; +} + diff --git a/sys-utils/lsmem.1 b/sys-utils/lsmem.1 new file mode 100644 index 0000000..4476d3e --- /dev/null +++ b/sys-utils/lsmem.1 @@ -0,0 +1,99 @@ +.TH LSMEM 1 "October 2016" "util-linux" "User Commands" +.SH NAME +lsmem \- list the ranges of available memory with their online status +.SH SYNOPSIS +.B lsmem +[options] +.SH DESCRIPTION +The \fBlsmem\fP command lists the ranges of available memory with their online +status. The listed memory blocks correspond to the memory block representation +in sysfs. The command also shows the memory block size and the amount of memory +in online and offline state. + +The default output compatible with original implementation from s390-tools, but +it's strongly recommended to avoid using default outputs in your scripts. +Always explicitly define expected columns by using the \fB\-\-output\fR option +together with a columns list in environments where a stable output is required. + +The \fBlsmem\fP command lists a new memory range always when the current memory +block distinguish from the previous block by some output column. This default +behavior is possible to override by the \fB\-\-split\fR option (e.g. \fBlsmem +\-\-split=ZONES\fR). The special word "none" may be used to ignore all +differences between memory blocks and to create as large as possible continuous +ranges. The opposite semantic is \fB\-\-all\fR to list individual memory +blocks. + +Note that some output columns may provide inaccurate information if a split policy +forces \fBlsmem\fP to ignore differences in some attributes. For example if you +merge removable and non-removable memory blocks to the one range than all +the range will be marked as non-removable on \fBlsmem\fP output. + +Not all columns are supported on all systems. If an unsupported column is +specified, \fBlsmem\fP prints the column but does not provide any data for it. + +Use the \fB\-\-help\fR option to see the columns description. + +.SH OPTIONS +.TP +.BR \-a ", " \-\-all +List each individual memory block, instead of combining memory blocks with +similar attributes. +.TP +.BR \-b , " \-\-bytes" +Print the SIZE column in bytes rather than in a human-readable format. +.TP +.BR \-h ", " \-\-help +Display help text and exit. +.TP +.BR \-J , " \-\-json" +Use JSON output format. +.TP +.BR \-n , " \-\-noheadings" +Do not print a header line. +.TP +.BR \-o , " \-\-output " \fIlist\fP +Specify which output columns to print. Use \fB\-\-help\fR +to get a list of all supported columns. +The default list of columns may be extended if \fIlist\fP is +specified in the format \fB+\fIlist\fP (e.g. \fBlsmem \-o +NODE\fP). +.TP +.B \-\-output\-all +Output all available columns. +.TP +.BR \-P , " \-\-pairs" +Produce output in the form of key="value" pairs. +All potentially unsafe characters are hex-escaped (\\x<code>). +.TP +.BR \-r , " \-\-raw" +Produce output in raw format. All potentially unsafe characters are hex-escaped +(\\x<code>). +.TP +.BR \-S , " \-\-split " \fIlist\fP +Specify which columns (attributes) use to split memory blocks to ranges. The +supported columns are STATE, REMOVABLE, NODE and ZONES, or "none". The another columns are +silently ignored. For more details see DESCRIPTION above. +.TP +.BR \-s , " \-\-sysroot " \fIdirectory\fP +Gather memory data for a Linux instance other than the instance from which the +\fBlsmem\fP command is issued. The specified \fIdirectory\fP is the system +root of the Linux instance to be inspected. +.TP +.BR \-V ", " \-\-version +Display version information and exit. +.TP +\fB\-\-summary\fR[=\fIwhen\fR] +This option controls summary lines output. The optional argument \fIwhen\fP can be +\fBnever\fR, \fBalways\fR or \fBonly\fR. If the \fIwhen\fR argument is +omitted, it defaults to \fB"only"\fR. The summary output is suppressed for +\fB\-\-raw\fR, \fB\-\-pairs\fR and \fB\-\-json\fR. +.SH AUTHOR +.B lsmem +was originally written by Gerald Schaefer for s390-tools in Perl. The C version +for util-linux was written by Clemens von Mann, Heiko Carstens and Karel Zak. +.SH SEE ALSO +.BR chmem (8) +.SH AVAILABILITY +The \fBlsmem\fP command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/lsmem.c b/sys-utils/lsmem.c new file mode 100644 index 0000000..8638336 --- /dev/null +++ b/sys-utils/lsmem.c @@ -0,0 +1,747 @@ +/* + * lsmem - Show memory configuration + * + * Copyright IBM Corp. 2016 + * Copyright (C) 2016 Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <c.h> +#include <nls.h> +#include <path.h> +#include <strutils.h> +#include <closestream.h> +#include <xalloc.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <dirent.h> +#include <fcntl.h> +#include <inttypes.h> +#include <assert.h> +#include <optutils.h> +#include <libsmartcols.h> + +#define _PATH_SYS_MEMORY "/sys/devices/system/memory" + +#define MEMORY_STATE_ONLINE 0 +#define MEMORY_STATE_OFFLINE 1 +#define MEMORY_STATE_GOING_OFFLINE 2 +#define MEMORY_STATE_UNKNOWN 3 + +enum zone_id { + ZONE_DMA = 0, + ZONE_DMA32, + ZONE_NORMAL, + ZONE_HIGHMEM, + ZONE_MOVABLE, + ZONE_DEVICE, + ZONE_NONE, + ZONE_UNKNOWN, + MAX_NR_ZONES, +}; + +struct memory_block { + uint64_t index; + uint64_t count; + int state; + int node; + int nr_zones; + int zones[MAX_NR_ZONES]; + unsigned int removable:1; +}; + +struct lsmem { + struct path_cxt *sysmem; /* _PATH_SYS_MEMORY directory handler */ + struct dirent **dirs; + int ndirs; + struct memory_block *blocks; + int nblocks; + uint64_t block_size; + uint64_t mem_online; + uint64_t mem_offline; + + struct libscols_table *table; + unsigned int have_nodes : 1, + raw : 1, + export : 1, + json : 1, + noheadings : 1, + summary : 1, + list_all : 1, + bytes : 1, + want_summary : 1, + want_table : 1, + split_by_node : 1, + split_by_state : 1, + split_by_removable : 1, + split_by_zones : 1, + have_zones : 1; +}; + + +enum { + COL_RANGE, + COL_SIZE, + COL_STATE, + COL_REMOVABLE, + COL_BLOCK, + COL_NODE, + COL_ZONES, +}; + +static char *zone_names[] = { + [ZONE_DMA] = "DMA", + [ZONE_DMA32] = "DMA32", + [ZONE_NORMAL] = "Normal", + [ZONE_HIGHMEM] = "Highmem", + [ZONE_MOVABLE] = "Movable", + [ZONE_DEVICE] = "Device", + [ZONE_NONE] = "None", /* block contains more than one zone, can't be offlined */ + [ZONE_UNKNOWN] = "Unknown", +}; + +/* column names */ +struct coldesc { + const char *name; /* header */ + double whint; /* width hint (N < 1 is in percent of termwidth) */ + int flags; /* SCOLS_FL_* */ + const char *help; +}; + +/* columns descriptions */ +static struct coldesc coldescs[] = { + [COL_RANGE] = { "RANGE", 0, 0, N_("start and end address of the memory range")}, + [COL_SIZE] = { "SIZE", 5, SCOLS_FL_RIGHT, N_("size of the memory range")}, + [COL_STATE] = { "STATE", 0, SCOLS_FL_RIGHT, N_("online status of the memory range")}, + [COL_REMOVABLE] = { "REMOVABLE", 0, SCOLS_FL_RIGHT, N_("memory is removable")}, + [COL_BLOCK] = { "BLOCK", 0, SCOLS_FL_RIGHT, N_("memory block number or blocks range")}, + [COL_NODE] = { "NODE", 0, SCOLS_FL_RIGHT, N_("numa node of memory")}, + [COL_ZONES] = { "ZONES", 0, SCOLS_FL_RIGHT, N_("valid zones for the memory range")}, +}; + +/* columns[] array specifies all currently wanted output column. The columns + * are defined by coldescs[] array and you can specify (on command line) each + * column twice. That's enough, dynamically allocated array of the columns is + * unnecessary overkill and over-engineering in this case */ +static int columns[ARRAY_SIZE(coldescs) * 2]; +static size_t ncolumns; + +static inline size_t err_columns_index(size_t arysz, size_t idx) +{ + if (idx >= arysz) + errx(EXIT_FAILURE, _("too many columns specified, " + "the limit is %zu columns"), + arysz - 1); + return idx; +} + +/* + * name must be null-terminated + */ +static int zone_name_to_id(const char *name) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(zone_names); i++) { + if (!strcasecmp(name, zone_names[i])) + return i; + } + return ZONE_UNKNOWN; +} + +#define add_column(ary, n, id) \ + ((ary)[ err_columns_index(ARRAY_SIZE(ary), (n)) ] = (id)) + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(coldescs); i++) { + const char *cn = coldescs[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static inline int get_column_id(int num) +{ + assert(num >= 0); + assert((size_t) num < ncolumns); + assert(columns[num] < (int) ARRAY_SIZE(coldescs)); + + return columns[num]; +} + +static inline struct coldesc *get_column_desc(int num) +{ + return &coldescs[ get_column_id(num) ]; +} + +static inline void reset_split_policy(struct lsmem *l, int enable) +{ + l->split_by_state = enable; + l->split_by_node = enable; + l->split_by_removable = enable; + l->split_by_zones = enable; +} + +static void set_split_policy(struct lsmem *l, int cols[], size_t ncols) +{ + size_t i; + + reset_split_policy(l, 0); + + for (i = 0; i < ncols; i++) { + switch (cols[i]) { + case COL_STATE: + l->split_by_state = 1; + break; + case COL_NODE: + l->split_by_node = 1; + break; + case COL_REMOVABLE: + l->split_by_removable = 1; + break; + case COL_ZONES: + l->split_by_zones = 1; + break; + default: + break; + } + } +} + +static void add_scols_line(struct lsmem *lsmem, struct memory_block *blk) +{ + size_t i; + struct libscols_line *line; + + line = scols_table_new_line(lsmem->table, NULL); + if (!line) + err_oom(); + + for (i = 0; i < ncolumns; i++) { + char *str = NULL; + + switch (get_column_id(i)) { + case COL_RANGE: + { + uint64_t start = blk->index * lsmem->block_size; + uint64_t size = blk->count * lsmem->block_size; + xasprintf(&str, "0x%016"PRIx64"-0x%016"PRIx64, start, start + size - 1); + break; + } + case COL_SIZE: + if (lsmem->bytes) + xasprintf(&str, "%"PRId64, (uint64_t) blk->count * lsmem->block_size); + else + str = size_to_human_string(SIZE_SUFFIX_1LETTER, + (uint64_t) blk->count * lsmem->block_size); + break; + case COL_STATE: + str = xstrdup( + blk->state == MEMORY_STATE_ONLINE ? _("online") : + blk->state == MEMORY_STATE_OFFLINE ? _("offline") : + blk->state == MEMORY_STATE_GOING_OFFLINE ? _("on->off") : + "?"); + break; + case COL_REMOVABLE: + if (blk->state == MEMORY_STATE_ONLINE) + str = xstrdup(blk->removable ? _("yes") : _("no")); + break; + case COL_BLOCK: + if (blk->count == 1) + xasprintf(&str, "%"PRId64, blk->index); + else + xasprintf(&str, "%"PRId64"-%"PRId64, + blk->index, blk->index + blk->count - 1); + break; + case COL_NODE: + if (lsmem->have_nodes) + xasprintf(&str, "%d", blk->node); + break; + case COL_ZONES: + if (lsmem->have_zones) { + char valid_zones[BUFSIZ]; + int j, zone_id; + + valid_zones[0] = '\0'; + for (j = 0; j < blk->nr_zones; j++) { + zone_id = blk->zones[j]; + if (strlen(valid_zones) + + strlen(zone_names[zone_id]) > BUFSIZ - 2) + break; + strcat(valid_zones, zone_names[zone_id]); + if (j + 1 < blk->nr_zones) + strcat(valid_zones, "/"); + } + str = xstrdup(valid_zones); + } + break; + } + + if (str && scols_line_refer_data(line, i, str) != 0) + err_oom(); + } +} + +static void fill_scols_table(struct lsmem *lsmem) +{ + int i; + + for (i = 0; i < lsmem->nblocks; i++) + add_scols_line(lsmem, &lsmem->blocks[i]); +} + +static void print_summary(struct lsmem *lsmem) +{ + if (lsmem->bytes) { + printf("%-23s %15"PRId64"\n",_("Memory block size:"), lsmem->block_size); + printf("%-23s %15"PRId64"\n",_("Total online memory:"), lsmem->mem_online); + printf("%-23s %15"PRId64"\n",_("Total offline memory:"), lsmem->mem_offline); + } else { + char *p; + + if ((p = size_to_human_string(SIZE_SUFFIX_1LETTER, lsmem->block_size))) + printf("%-23s %5s\n",_("Memory block size:"), p); + free(p); + + if ((p = size_to_human_string(SIZE_SUFFIX_1LETTER, lsmem->mem_online))) + printf("%-23s %5s\n",_("Total online memory:"), p); + free(p); + + if ((p = size_to_human_string(SIZE_SUFFIX_1LETTER, lsmem->mem_offline))) + printf("%-23s %5s\n",_("Total offline memory:"), p); + free(p); + } +} + +static int memory_block_get_node(struct lsmem *lsmem, char *name) +{ + struct dirent *de; + DIR *dir; + int node; + + dir = ul_path_opendir(lsmem->sysmem, name); + if (!dir) + err(EXIT_FAILURE, _("Failed to open %s"), name); + + node = -1; + while ((de = readdir(dir)) != NULL) { + if (strncmp("node", de->d_name, 4)) + continue; + if (!isdigit_string(de->d_name + 4)) + continue; + node = strtol(de->d_name + 4, NULL, 10); + break; + } + closedir(dir); + return node; +} + +static void memory_block_read_attrs(struct lsmem *lsmem, char *name, + struct memory_block *blk) +{ + char *line = NULL; + int i, x = 0; + + memset(blk, 0, sizeof(*blk)); + + blk->count = 1; + blk->state = MEMORY_STATE_UNKNOWN; + blk->index = strtoumax(name + 6, NULL, 10); /* get <num> of "memory<num>" */ + + if (ul_path_readf_s32(lsmem->sysmem, &x, "%s/removable", name) == 0) + blk->removable = x == 1; + + if (ul_path_readf_string(lsmem->sysmem, &line, "%s/state", name) > 0) { + if (strcmp(line, "offline") == 0) + blk->state = MEMORY_STATE_OFFLINE; + else if (strcmp(line, "online") == 0) + blk->state = MEMORY_STATE_ONLINE; + else if (strcmp(line, "going-offline") == 0) + blk->state = MEMORY_STATE_GOING_OFFLINE; + free(line); + } + + if (lsmem->have_nodes) + blk->node = memory_block_get_node(lsmem, name); + + blk->nr_zones = 0; + if (lsmem->have_zones && + ul_path_readf_string(lsmem->sysmem, &line, "%s/valid_zones", name) > 0) { + + char *token = strtok(line, " "); + + for (i = 0; token && i < MAX_NR_ZONES; i++) { + blk->zones[i] = zone_name_to_id(token); + blk->nr_zones++; + token = strtok(NULL, " "); + } + + free(line); + } +} + +static int is_mergeable(struct lsmem *lsmem, struct memory_block *blk) +{ + struct memory_block *curr; + int i; + + if (!lsmem->nblocks) + return 0; + curr = &lsmem->blocks[lsmem->nblocks - 1]; + if (lsmem->list_all) + return 0; + if (curr->index + curr->count != blk->index) + return 0; + if (lsmem->split_by_state && curr->state != blk->state) + return 0; + if (lsmem->split_by_removable && curr->removable != blk->removable) + return 0; + if (lsmem->split_by_node && lsmem->have_nodes) { + if (curr->node != blk->node) + return 0; + } + if (lsmem->split_by_zones && lsmem->have_zones) { + if (curr->nr_zones != blk->nr_zones) + return 0; + for (i = 0; i < curr->nr_zones; i++) { + if (curr->zones[i] == ZONE_UNKNOWN || + curr->zones[i] != blk->zones[i]) + return 0; + } + } + return 1; +} + +static void read_info(struct lsmem *lsmem) +{ + struct memory_block blk; + char buf[128]; + int i; + + if (ul_path_read_buffer(lsmem->sysmem, buf, sizeof(buf), "block_size_bytes") <= 0) + err(EXIT_FAILURE, _("failed to read memory block size")); + lsmem->block_size = strtoumax(buf, NULL, 16); + + for (i = 0; i < lsmem->ndirs; i++) { + memory_block_read_attrs(lsmem, lsmem->dirs[i]->d_name, &blk); + if (blk.state == MEMORY_STATE_ONLINE) + lsmem->mem_online += lsmem->block_size; + else + lsmem->mem_offline += lsmem->block_size; + if (is_mergeable(lsmem, &blk)) { + lsmem->blocks[lsmem->nblocks - 1].count++; + continue; + } + lsmem->nblocks++; + lsmem->blocks = xrealloc(lsmem->blocks, lsmem->nblocks * sizeof(blk)); + *&lsmem->blocks[lsmem->nblocks - 1] = blk; + } +} + +static int memory_block_filter(const struct dirent *de) +{ + if (strncmp("memory", de->d_name, 6)) + return 0; + return isdigit_string(de->d_name + 6); +} + +static void read_basic_info(struct lsmem *lsmem) +{ + char dir[PATH_MAX]; + + if (ul_path_access(lsmem->sysmem, F_OK, "block_size_bytes") != 0) + errx(EXIT_FAILURE, _("This system does not support memory blocks")); + + ul_path_get_abspath(lsmem->sysmem, dir, sizeof(dir), NULL); + + lsmem->ndirs = scandir(dir, &lsmem->dirs, memory_block_filter, versionsort); + if (lsmem->ndirs <= 0) + err(EXIT_FAILURE, _("Failed to read %s"), dir); + + if (memory_block_get_node(lsmem, lsmem->dirs[0]->d_name) != -1) + lsmem->have_nodes = 1; + + /* The valid_zones sysmem attribute was introduced with kernel 3.18 */ + if (ul_path_access(lsmem->sysmem, F_OK, "memory0/valid_zones") == 0) + lsmem->have_zones = 1; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("List the ranges of available memory with their online status.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -J, --json use JSON output format\n"), out); + fputs(_(" -P, --pairs use key=\"value\" output format\n"), out); + fputs(_(" -a, --all list each individual memory block\n"), out); + fputs(_(" -b, --bytes print SIZE in bytes rather than in human readable format\n"), out); + fputs(_(" -n, --noheadings don't print headings\n"), out); + fputs(_(" -o, --output <list> output columns\n"), out); + fputs(_(" --output-all output all columns\n"), out); + fputs(_(" -r, --raw use raw output format\n"), out); + fputs(_(" -S, --split <list> split ranges by specified columns\n"), out); + fputs(_(" -s, --sysroot <dir> use the specified directory as system root\n"), out); + fputs(_(" --summary[=when] print summary information (never,always or only)\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(22)); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(coldescs); i++) + fprintf(out, " %10s %s\n", coldescs[i].name, _(coldescs[i].help)); + + printf(USAGE_MAN_TAIL("lsmem(1)")); + + exit(out == stderr ? EXIT_FAILURE : EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + struct lsmem _lsmem = { + .want_table = 1, + .want_summary = 1 + }, *lsmem = &_lsmem; + + const char *outarg = NULL, *splitarg = NULL, *prefix = NULL; + int c; + size_t i; + + enum { + LSMEM_OPT_SUMARRY = CHAR_MAX + 1, + OPT_OUTPUT_ALL + }; + + static const struct option longopts[] = { + {"all", no_argument, NULL, 'a'}, + {"bytes", no_argument, NULL, 'b'}, + {"help", no_argument, NULL, 'h'}, + {"json", no_argument, NULL, 'J'}, + {"noheadings", no_argument, NULL, 'n'}, + {"output", required_argument, NULL, 'o'}, + {"output-all", no_argument, NULL, OPT_OUTPUT_ALL}, + {"pairs", no_argument, NULL, 'P'}, + {"raw", no_argument, NULL, 'r'}, + {"sysroot", required_argument, NULL, 's'}, + {"split", required_argument, NULL, 'S'}, + {"version", no_argument, NULL, 'V'}, + {"summary", optional_argument, NULL, LSMEM_OPT_SUMARRY }, + {NULL, 0, NULL, 0} + }; + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'J', 'P', 'r' }, + { 'S', 'a' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "abhJno:PrS:s:V", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'a': + lsmem->list_all = 1; + break; + case 'b': + lsmem->bytes = 1; + break; + case 'h': + usage(); + break; + case 'J': + lsmem->json = 1; + lsmem->want_summary = 0; + break; + case 'n': + lsmem->noheadings = 1; + break; + case 'o': + outarg = optarg; + break; + case OPT_OUTPUT_ALL: + for (ncolumns = 0; (size_t)ncolumns < ARRAY_SIZE(coldescs); ncolumns++) + columns[ncolumns] = ncolumns; + break; + case 'P': + lsmem->export = 1; + lsmem->want_summary = 0; + break; + case 'r': + lsmem->raw = 1; + lsmem->want_summary = 0; + break; + case 's': + prefix = optarg; + break; + case 'S': + splitarg = optarg; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return 0; + case LSMEM_OPT_SUMARRY: + if (optarg) { + if (strcmp(optarg, "never") == 0) + lsmem->want_summary = 0; + else if (strcmp(optarg, "only") == 0) + lsmem->want_table = 0; + else if (strcmp(optarg, "always") == 0) + lsmem->want_summary = 1; + else + errx(EXIT_FAILURE, _("unsupported --summary argument")); + } else + lsmem->want_table = 0; + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (argc != optind) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + if (lsmem->want_table + lsmem->want_summary == 0) + errx(EXIT_FAILURE, _("options --{raw,json,pairs} and --summary=only are mutually exclusive")); + + ul_path_init_debug(); + + lsmem->sysmem = ul_new_path(_PATH_SYS_MEMORY); + if (!lsmem->sysmem) + err(EXIT_FAILURE, _("failed to initialize %s handler"), _PATH_SYS_MEMORY); + if (prefix && ul_path_set_prefix(lsmem->sysmem, prefix) != 0) + err(EXIT_FAILURE, _("invalid argument to --sysroot")); + + /* Shortcut to avoid scols machinery on --summary=only */ + if (lsmem->want_table == 0 && lsmem->want_summary) { + read_basic_info(lsmem); + read_info(lsmem); + print_summary(lsmem); + return EXIT_SUCCESS; + } + + /* + * Default columns + */ + if (!ncolumns) { + add_column(columns, ncolumns++, COL_RANGE); + add_column(columns, ncolumns++, COL_SIZE); + add_column(columns, ncolumns++, COL_STATE); + add_column(columns, ncolumns++, COL_REMOVABLE); + add_column(columns, ncolumns++, COL_BLOCK); + } + + if (outarg && string_add_to_idarray(outarg, columns, ARRAY_SIZE(columns), + &ncolumns, column_name_to_id) < 0) + return EXIT_FAILURE; + + /* + * Initialize output + */ + scols_init_debug(0); + + if (!(lsmem->table = scols_new_table())) + errx(EXIT_FAILURE, _("failed to initialize output table")); + scols_table_enable_raw(lsmem->table, lsmem->raw); + scols_table_enable_export(lsmem->table, lsmem->export); + scols_table_enable_json(lsmem->table, lsmem->json); + scols_table_enable_noheadings(lsmem->table, lsmem->noheadings); + + if (lsmem->json) + scols_table_set_name(lsmem->table, "memory"); + + for (i = 0; i < ncolumns; i++) { + struct coldesc *ci = get_column_desc(i); + struct libscols_column *cl; + + cl = scols_table_new_column(lsmem->table, ci->name, ci->whint, ci->flags); + if (!cl) + err(EXIT_FAILURE, _("Failed to initialize output column")); + + if (lsmem->json) { + int id = get_column_id(i); + + switch (id) { + case COL_SIZE: + if (!lsmem->bytes) + break; + /* fallthrough */ + case COL_NODE: + scols_column_set_json_type(cl, SCOLS_JSON_NUMBER); + break; + case COL_REMOVABLE: + scols_column_set_json_type(cl, SCOLS_JSON_BOOLEAN); + break; + } + } + } + + if (splitarg) { + int split[ARRAY_SIZE(coldescs)] = { 0 }; + static size_t nsplits = 0; + + if (strcasecmp(splitarg, "none") == 0) + ; + else if (string_add_to_idarray(splitarg, split, ARRAY_SIZE(split), + &nsplits, column_name_to_id) < 0) + return EXIT_FAILURE; + + set_split_policy(lsmem, split, nsplits); + + } else + /* follow output columns */ + set_split_policy(lsmem, columns, ncolumns); + + /* + * Read data and print output + */ + read_basic_info(lsmem); + read_info(lsmem); + + if (lsmem->want_table) { + fill_scols_table(lsmem); + scols_print_table(lsmem->table); + + if (lsmem->want_summary) + fputc('\n', stdout); + } + + if (lsmem->want_summary) + print_summary(lsmem); + + scols_unref_table(lsmem->table); + ul_unref_path(lsmem->sysmem); + return 0; +} diff --git a/sys-utils/lsns.8 b/sys-utils/lsns.8 new file mode 100644 index 0000000..aba3726 --- /dev/null +++ b/sys-utils/lsns.8 @@ -0,0 +1,93 @@ +.\" Man page for the lsns command. +.\" Copyright 2015 Karel Zak <kzak@redhat.com> +.\" May be distributed under the GNU General Public License + +.TH LSNS 8 "December 2015" "util-linux" "System Administration" +.SH NAME +lsns \- list namespaces +.SH SYNOPSIS +.B lsns +[options] +.RI [ namespace ] + +.SH DESCRIPTION +.B lsns +lists information about all the currently accessible namespaces or about the +given \fInamespace\fP. The \fInamespace\fP identifier is an inode number. + +The default output is subject to change. So whenever possible, you should +avoid using default outputs in your scripts. Always explicitly define expected +columns by using the \fB\-\-output\fR option together with a columns list in +environments where a stable output is required. + +\fBNSFS\fP column, printed when \fBnet\fP is specified for +\fB\-\-type\fR option, is special; it uses multi-line cells. +Use the option \fB\-\-nowrap\fR is for switching to "," separated single-line +representation. + +Note that \fBlsns\fR reads information directly from the /proc filesystem and +for non-root users it may return incomplete information. The current /proc +filesystem may be unshared and affected by a PID namespace +(see \fBunshare \-\-mount\-proc\fP for more details). +.B lsns +is not able to see persistent namespaces without processes where the namespace +instance is held by a bind mount to /proc/\fIpid\fR/ns/\fItype\fR. + +.SH OPTIONS +.TP +.BR \-J , " \-\-json" +Use JSON output format. +.TP +.BR \-l , " \-\-list" +Use list output format. +.TP +.BR \-n , " \-\-noheadings" +Do not print a header line. +.TP +.BR \-o , " \-\-output " \fIlist\fP +Specify which output columns to print. Use \fB\-\-help\fR +to get a list of all supported columns. + +The default list of columns may be extended if \fIlist\fP is +specified in the format \fB+\fIlist\fP (e.g. \fBlsns \-o +PATH\fP). +.TP +.B \-\-output\-all +Output all available columns. +.TP +.BR \-p , " \-\-task " \fIpid\fP +Display only the namespaces held by the process with this \fIpid\fR. +.TP +.BR \-r , " \-\-raw" +Use the raw output format. +.TP +.BR \-t , " \-\-type " \fItype\fP +Display the specified \fItype\fP of namespaces only. The supported types are +\fBmnt\fP, \fBnet\fP, \fBipc\fP, \fBuser\fP, \fBpid\fP, \fButs\fP and +\fBcgroup\fP. This option may be given more than once. +.TP +.BR \-u , " \-\-notruncate" +Do not truncate text in columns. +.TP +.BR \-W , " \-\-nowrap" +Do not use multi-line text in columns. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. + +.SH AUTHORS +.nf +Karel Zak <kzak@redhat.com> +.fi + +.SH "SEE ALSO" +.BR nsenter (1), +.BR unshare (1), +.BR clone (2), +.BR namespaces (7) + +.SH AVAILABILITY +The lsns command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/lsns.c b/sys-utils/lsns.c new file mode 100644 index 0000000..38ea2e0 --- /dev/null +++ b/sys-utils/lsns.c @@ -0,0 +1,1100 @@ +/* + * lsns(8) - list system namespaces + * + * Copyright (C) 2015 Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdio.h> +#include <string.h> +#include <getopt.h> +#include <stdlib.h> +#include <assert.h> +#include <dirent.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <wchar.h> +#include <libsmartcols.h> +#include <libmount.h> + +#ifdef HAVE_LINUX_NET_NAMESPACE_H +#include <stdbool.h> +#include <sys/socket.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/net_namespace.h> +#endif + +#include "pathnames.h" +#include "nls.h" +#include "xalloc.h" +#include "c.h" +#include "list.h" +#include "closestream.h" +#include "optutils.h" +#include "procutils.h" +#include "strutils.h" +#include "namespace.h" +#include "idcache.h" + +#include "debug.h" + +static UL_DEBUG_DEFINE_MASK(lsns); +UL_DEBUG_DEFINE_MASKNAMES(lsns) = UL_DEBUG_EMPTY_MASKNAMES; + +#define LSNS_DEBUG_INIT (1 << 1) +#define LSNS_DEBUG_PROC (1 << 2) +#define LSNS_DEBUG_NS (1 << 3) +#define LSNS_DEBUG_ALL 0xFFFF + +#define LSNS_NETNS_UNUSABLE -2 + +#define DBG(m, x) __UL_DBG(lsns, LSNS_DEBUG_, m, x) +#define ON_DBG(m, x) __UL_DBG_CALL(lsns, LSNS_DEBUG_, m, x) + +#define UL_DEBUG_CURRENT_MASK UL_DEBUG_MASK(lsns) +#include "debugobj.h" + +static struct idcache *uid_cache = NULL; + +/* column IDs */ +enum { + COL_NS = 0, + COL_TYPE, + COL_PATH, + COL_NPROCS, + COL_PID, + COL_PPID, + COL_COMMAND, + COL_UID, + COL_USER, + COL_NETNSID, + COL_NSFS, +}; + +/* column names */ +struct colinfo { + const char *name; /* header */ + double whint; /* width hint (N < 1 is in percent of termwidth) */ + int flags; /* SCOLS_FL_* */ + const char *help; + int json_type; +}; + +/* columns descriptions */ +static const struct colinfo infos[] = { + [COL_NS] = { "NS", 10, SCOLS_FL_RIGHT, N_("namespace identifier (inode number)"), SCOLS_JSON_NUMBER }, + [COL_TYPE] = { "TYPE", 5, 0, N_("kind of namespace") }, + [COL_PATH] = { "PATH", 0, 0, N_("path to the namespace")}, + [COL_NPROCS] = { "NPROCS", 5, SCOLS_FL_RIGHT, N_("number of processes in the namespace"), SCOLS_JSON_NUMBER }, + [COL_PID] = { "PID", 5, SCOLS_FL_RIGHT, N_("lowest PID in the namespace"), SCOLS_JSON_NUMBER }, + [COL_PPID] = { "PPID", 5, SCOLS_FL_RIGHT, N_("PPID of the PID"), SCOLS_JSON_NUMBER }, + [COL_COMMAND] = { "COMMAND", 0, SCOLS_FL_TRUNC, N_("command line of the PID")}, + [COL_UID] = { "UID", 0, SCOLS_FL_RIGHT, N_("UID of the PID"), SCOLS_JSON_NUMBER}, + [COL_USER] = { "USER", 0, 0, N_("username of the PID")}, + [COL_NETNSID] = { "NETNSID", 0, SCOLS_FL_RIGHT, N_("namespace ID as used by network subsystem")}, + [COL_NSFS] = { "NSFS", 0, SCOLS_FL_WRAP, N_("nsfs mountpoint (usually used network subsystem)")} +}; + +static int columns[ARRAY_SIZE(infos) * 2]; +static size_t ncolumns; + +enum { + LSNS_ID_MNT = 0, + LSNS_ID_NET, + LSNS_ID_PID, + LSNS_ID_UTS, + LSNS_ID_IPC, + LSNS_ID_USER, + LSNS_ID_CGROUP +}; + +static char *ns_names[] = { + [LSNS_ID_MNT] = "mnt", + [LSNS_ID_NET] = "net", + [LSNS_ID_PID] = "pid", + [LSNS_ID_UTS] = "uts", + [LSNS_ID_IPC] = "ipc", + [LSNS_ID_USER] = "user", + [LSNS_ID_CGROUP] = "cgroup" +}; + +struct lsns_namespace { + ino_t id; + int type; /* LSNS_* */ + int nprocs; + int netnsid; + + struct lsns_process *proc; + + struct list_head namespaces; /* lsns->processes member */ + struct list_head processes; /* head of lsns_process *siblings */ +}; + +struct lsns_process { + pid_t pid; /* process PID */ + pid_t ppid; /* parent's PID */ + pid_t tpid; /* thread group */ + char state; + uid_t uid; + + ino_t ns_ids[ARRAY_SIZE(ns_names)]; + struct list_head ns_siblings[ARRAY_SIZE(ns_names)]; + + struct list_head processes; /* list of processes */ + + struct libscols_line *outline; + struct lsns_process *parent; + + int netnsid; +}; + +struct lsns { + struct list_head processes; + struct list_head namespaces; + + pid_t fltr_pid; /* filter out by PID */ + ino_t fltr_ns; /* filter out by namespace */ + int fltr_types[ARRAY_SIZE(ns_names)]; + int fltr_ntypes; + + unsigned int raw : 1, + json : 1, + tree : 1, + list : 1, + no_trunc : 1, + no_headings: 1, + no_wrap : 1; + + struct libmnt_table *tab; +}; + +struct netnsid_cache { + ino_t ino; + int id; + struct list_head netnsids; +}; + +static struct list_head netnsids_cache; + +static int netlink_fd = -1; + +static void lsns_init_debug(void) +{ + __UL_INIT_DEBUG_FROM_ENV(lsns, LSNS_DEBUG_, 0, LSNS_DEBUG); +} + +static int ns_name2type(const char *name) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(ns_names); i++) { + if (strcmp(ns_names[i], name) == 0) + return i; + } + return -1; +} + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + assert(name); + + for (i = 0; i < ARRAY_SIZE(infos); i++) { + const char *cn = infos[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static int has_column(int id) +{ + size_t i; + + for (i = 0; i < ncolumns; i++) { + if (columns[i] == id) + return 1; + } + return 0; +} + +static inline int get_column_id(int num) +{ + assert(num >= 0); + assert((size_t) num < ncolumns); + assert(columns[num] < (int) ARRAY_SIZE(infos)); + + return columns[num]; +} + +static inline const struct colinfo *get_column_info(unsigned num) +{ + return &infos[ get_column_id(num) ]; +} + +static int get_ns_ino(int dir, const char *nsname, ino_t *ino) +{ + struct stat st; + char path[16]; + + snprintf(path, sizeof(path), "ns/%s", nsname); + + if (fstatat(dir, path, &st, 0) != 0) + return -errno; + *ino = st.st_ino; + return 0; +} + +static int parse_proc_stat(FILE *fp, pid_t *pid, char *state, pid_t *ppid) +{ + char *line = NULL, *p; + size_t len = 0; + int rc; + + if (getline(&line, &len, fp) < 0) { + rc = -errno; + goto error; + } + + p = strrchr(line, ')'); + if (p == NULL || + sscanf(line, "%d (", pid) != 1 || + sscanf(p, ") %c %d*[^\n]", state, ppid) != 2) { + rc = -EINVAL; + goto error; + } + rc = 0; + +error: + free(line); + return rc; +} + +#ifdef HAVE_LINUX_NET_NAMESPACE_H +static int netnsid_cache_find(ino_t netino, int *netnsid) +{ + struct list_head *p; + + list_for_each(p, &netnsids_cache) { + struct netnsid_cache *e = list_entry(p, + struct netnsid_cache, + netnsids); + if (e->ino == netino) { + *netnsid = e->id; + return 1; + } + } + + return 0; +} + +static void netnsid_cache_add(ino_t netino, int netnsid) +{ + struct netnsid_cache *e; + + e = xcalloc(1, sizeof(*e)); + e->ino = netino; + e->id = netnsid; + INIT_LIST_HEAD(&e->netnsids); + list_add(&e->netnsids, &netnsids_cache); +} + +static int get_netnsid_via_netlink_send_request(int target_fd) +{ + unsigned char req[NLMSG_SPACE(sizeof(struct rtgenmsg)) + + RTA_SPACE(sizeof(int32_t))]; + + struct nlmsghdr *nlh = (struct nlmsghdr *)req; + struct rtgenmsg *rt = NLMSG_DATA(req); + struct rtattr *rta = (struct rtattr *) + (req + NLMSG_SPACE(sizeof(struct rtgenmsg))); + int32_t *fd = RTA_DATA(rta); + + nlh->nlmsg_len = sizeof(req); + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_type = RTM_GETNSID; + rt->rtgen_family = AF_UNSPEC; + rta->rta_type = NETNSA_FD; + rta->rta_len = RTA_SPACE(sizeof(int32_t)); + *fd = target_fd; + + if (send(netlink_fd, req, sizeof(req), 0) < 0) + return -1; + return 0; +} + +static int get_netnsid_via_netlink_recv_response(int *netnsid) +{ + unsigned char res[NLMSG_SPACE(sizeof(struct rtgenmsg)) + + ((RTA_SPACE(sizeof(int32_t)) + < RTA_SPACE(sizeof(struct nlmsgerr))) + ? RTA_SPACE(sizeof(struct nlmsgerr)) + : RTA_SPACE(sizeof(int32_t)))]; + int rtalen; + ssize_t reslen; + + struct nlmsghdr *nlh; + struct rtattr *rta; + + reslen = recv(netlink_fd, res, sizeof(res), 0); + if (reslen < 0) + return -1; + + nlh = (struct nlmsghdr *)res; + if (!(NLMSG_OK(nlh, (size_t)reslen) + && nlh->nlmsg_type == RTM_NEWNSID)) + return -1; + + rtalen = NLMSG_PAYLOAD(nlh, sizeof(struct rtgenmsg)); + rta = (struct rtattr *)(res + NLMSG_SPACE(sizeof(struct rtgenmsg))); + if (!(RTA_OK(rta, rtalen) + && rta->rta_type == NETNSA_NSID)) + return -1; + + *netnsid = *(int *)RTA_DATA(rta); + + return 0; +} + +static int get_netnsid_via_netlink(int dir, const char *path) +{ + int netnsid; + int target_fd; + + if (netlink_fd < 0) + return LSNS_NETNS_UNUSABLE; + + target_fd = openat(dir, path, O_RDONLY); + if (target_fd < 0) + return LSNS_NETNS_UNUSABLE; + + if (get_netnsid_via_netlink_send_request(target_fd) < 0) { + netnsid = LSNS_NETNS_UNUSABLE; + goto out; + } + + if (get_netnsid_via_netlink_recv_response(&netnsid) < 0) { + netnsid = LSNS_NETNS_UNUSABLE; + goto out; + } + + out: + close(target_fd); + return netnsid; +} + +static int get_netnsid(int dir, ino_t netino) +{ + int netnsid; + + if (!netnsid_cache_find(netino, &netnsid)) { + netnsid = get_netnsid_via_netlink(dir, "ns/net"); + netnsid_cache_add(netino, netnsid); + } + + return netnsid; +} +#else +static int get_netnsid(int dir __attribute__((__unused__)), + ino_t netino __attribute__((__unused__))) +{ + return LSNS_NETNS_UNUSABLE; +} +#endif /* HAVE_LINUX_NET_NAMESPACE_H */ + +static int read_process(struct lsns *ls, pid_t pid) +{ + struct lsns_process *p = NULL; + char buf[BUFSIZ]; + DIR *dir; + int rc = 0, fd; + FILE *f = NULL; + size_t i; + struct stat st; + + DBG(PROC, ul_debug("reading %d", (int) pid)); + + snprintf(buf, sizeof(buf), "/proc/%d", pid); + dir = opendir(buf); + if (!dir) + return -errno; + + p = xcalloc(1, sizeof(*p)); + p->netnsid = LSNS_NETNS_UNUSABLE; + + if (fstat(dirfd(dir), &st) == 0) { + p->uid = st.st_uid; + add_uid(uid_cache, st.st_uid); + } + + fd = openat(dirfd(dir), "stat", O_RDONLY); + if (fd < 0) { + rc = -errno; + goto done; + } + if (!(f = fdopen(fd, "r"))) { + rc = -errno; + goto done; + } + rc = parse_proc_stat(f, &p->pid, &p->state, &p->ppid); + if (rc < 0) + goto done; + rc = 0; + + for (i = 0; i < ARRAY_SIZE(p->ns_ids); i++) { + INIT_LIST_HEAD(&p->ns_siblings[i]); + + if (!ls->fltr_types[i]) + continue; + + rc = get_ns_ino(dirfd(dir), ns_names[i], &p->ns_ids[i]); + if (rc && rc != -EACCES && rc != -ENOENT) + goto done; + if (i == LSNS_ID_NET) + p->netnsid = get_netnsid(dirfd(dir), p->ns_ids[i]); + rc = 0; + } + + INIT_LIST_HEAD(&p->processes); + + DBG(PROC, ul_debugobj(p, "new pid=%d", p->pid)); + list_add_tail(&p->processes, &ls->processes); +done: + if (f) + fclose(f); + closedir(dir); + if (rc) + free(p); + return rc; +} + +static int read_processes(struct lsns *ls) +{ + struct proc_processes *proc = NULL; + pid_t pid; + int rc = 0; + + DBG(PROC, ul_debug("opening /proc")); + + if (!(proc = proc_open_processes())) { + rc = -errno; + goto done; + } + + while (proc_next_pid(proc, &pid) == 0) { + rc = read_process(ls, pid); + if (rc && rc != -EACCES && rc != -ENOENT) + break; + rc = 0; + } +done: + DBG(PROC, ul_debug("closing /proc")); + proc_close_processes(proc); + return rc; +} + +static struct lsns_namespace *get_namespace(struct lsns *ls, ino_t ino) +{ + struct list_head *p; + + list_for_each(p, &ls->namespaces) { + struct lsns_namespace *ns = list_entry(p, struct lsns_namespace, namespaces); + + if (ns->id == ino) + return ns; + } + return NULL; +} + +static int namespace_has_process(struct lsns_namespace *ns, pid_t pid) +{ + struct list_head *p; + + list_for_each(p, &ns->processes) { + struct lsns_process *proc = list_entry(p, struct lsns_process, ns_siblings[ns->type]); + + if (proc->pid == pid) + return 1; + } + return 0; +} + +static struct lsns_namespace *add_namespace(struct lsns *ls, int type, ino_t ino) +{ + struct lsns_namespace *ns = xcalloc(1, sizeof(*ns)); + + if (!ns) + return NULL; + + DBG(NS, ul_debugobj(ns, "new %s[%ju]", ns_names[type], (uintmax_t)ino)); + + INIT_LIST_HEAD(&ns->processes); + INIT_LIST_HEAD(&ns->namespaces); + + ns->type = type; + ns->id = ino; + + list_add_tail(&ns->namespaces, &ls->namespaces); + return ns; +} + +static int add_process_to_namespace(struct lsns *ls, struct lsns_namespace *ns, struct lsns_process *proc) +{ + struct list_head *p; + + DBG(NS, ul_debugobj(ns, "add process [%p] pid=%d to %s[%ju]", + proc, proc->pid, ns_names[ns->type], (uintmax_t)ns->id)); + + list_for_each(p, &ls->processes) { + struct lsns_process *xproc = list_entry(p, struct lsns_process, processes); + + if (xproc->pid == proc->ppid) /* my parent */ + proc->parent = xproc; + else if (xproc->ppid == proc->pid) /* my child */ + xproc->parent = proc; + } + + list_add_tail(&proc->ns_siblings[ns->type], &ns->processes); + ns->nprocs++; + + if (!ns->proc || ns->proc->pid > proc->pid) + ns->proc = proc; + + return 0; +} + +static int cmp_namespaces(struct list_head *a, struct list_head *b, + __attribute__((__unused__)) void *data) +{ + struct lsns_namespace *xa = list_entry(a, struct lsns_namespace, namespaces), + *xb = list_entry(b, struct lsns_namespace, namespaces); + + return cmp_numbers(xa->id, xb->id); +} + +static int netnsid_xasputs(char **str, int netnsid) +{ + if (netnsid >= 0) + return xasprintf(str, "%d", netnsid); +#ifdef NETNSA_NSID_NOT_ASSIGNED + else if (netnsid == NETNSA_NSID_NOT_ASSIGNED) + return xasprintf(str, "%s", "unassigned"); +#endif + else + return 0; +} + +static int read_namespaces(struct lsns *ls) +{ + struct list_head *p; + + DBG(NS, ul_debug("reading namespace")); + + list_for_each(p, &ls->processes) { + size_t i; + struct lsns_namespace *ns; + struct lsns_process *proc = list_entry(p, struct lsns_process, processes); + + for (i = 0; i < ARRAY_SIZE(proc->ns_ids); i++) { + if (proc->ns_ids[i] == 0) + continue; + if (!(ns = get_namespace(ls, proc->ns_ids[i]))) { + ns = add_namespace(ls, i, proc->ns_ids[i]); + if (!ns) + return -ENOMEM; + } + add_process_to_namespace(ls, ns, proc); + } + } + + list_sort(&ls->namespaces, cmp_namespaces, NULL); + + return 0; +} + +static int is_nsfs_root(struct libmnt_fs *fs, void *data) +{ + if (!mnt_fs_match_fstype(fs, "nsfs") || !mnt_fs_get_root(fs)) + return 0; + + return (strcmp(mnt_fs_get_root(fs), (char *)data) == 0); +} + +static int is_path_included(const char *path_set, const char *elt, + const char sep) +{ + size_t elt_len; + size_t path_set_len; + char *tmp; + + + tmp = strstr(path_set, elt); + if (!tmp) + return 0; + + elt_len = strlen(elt); + path_set_len = strlen(path_set); + + /* path_set includes only elt or + * path_set includes elt as the first element. + */ + if (tmp == path_set + && ((path_set_len == elt_len) + || (path_set[elt_len] == sep))) + return 1; + + /* path_set includes elt at the middle + * or as the last element. + */ + if ((*(tmp - 1) == sep) + && ((*(tmp + elt_len) == sep) + || (*(tmp + elt_len) == '\0'))) + return 1; + + return 0; +} + +static int nsfs_xasputs(char **str, + struct lsns_namespace *ns, + struct libmnt_table *tab, + char sep) +{ + struct libmnt_iter *itr = mnt_new_iter(MNT_ITER_FORWARD); + char *expected_root; + struct libmnt_fs *fs = NULL; + + xasprintf(&expected_root, "%s:[%ju]", ns_names[ns->type], (uintmax_t)ns->id); + *str = NULL; + + while (mnt_table_find_next_fs(tab, itr, is_nsfs_root, + expected_root, &fs) == 0) { + + const char *tgt = mnt_fs_get_target(fs); + + if (!*str) + xasprintf(str, "%s", tgt); + + else if (!is_path_included(*str, tgt, sep)) { + char *tmp = NULL; + + xasprintf(&tmp, "%s%c%s", *str, sep, tgt); + free(*str); + *str = tmp; + } + } + free(expected_root); + mnt_free_iter(itr); + + return 1; +} +static void add_scols_line(struct lsns *ls, struct libscols_table *table, + struct lsns_namespace *ns, struct lsns_process *proc) +{ + size_t i; + struct libscols_line *line; + + assert(ns); + assert(table); + + line = scols_table_new_line(table, + ls->tree && proc->parent ? proc->parent->outline : NULL); + if (!line) { + warn(_("failed to add line to output")); + return; + } + + for (i = 0; i < ncolumns; i++) { + char *str = NULL; + + switch (get_column_id(i)) { + case COL_NS: + xasprintf(&str, "%ju", (uintmax_t)ns->id); + break; + case COL_PID: + xasprintf(&str, "%d", (int) proc->pid); + break; + case COL_PPID: + xasprintf(&str, "%d", (int) proc->ppid); + break; + case COL_TYPE: + xasprintf(&str, "%s", ns_names[ns->type]); + break; + case COL_NPROCS: + xasprintf(&str, "%d", ns->nprocs); + break; + case COL_COMMAND: + str = proc_get_command(proc->pid); + if (!str) + str = proc_get_command_name(proc->pid); + break; + case COL_PATH: + xasprintf(&str, "/proc/%d/ns/%s", (int) proc->pid, ns_names[ns->type]); + break; + case COL_UID: + xasprintf(&str, "%d", (int) proc->uid); + break; + case COL_USER: + xasprintf(&str, "%s", get_id(uid_cache, proc->uid)->name); + break; + case COL_NETNSID: + if (ns->type == LSNS_ID_NET) + netnsid_xasputs(&str, proc->netnsid); + break; + case COL_NSFS: + nsfs_xasputs(&str, ns, ls->tab, ls->no_wrap ? ',' : '\n'); + break; + default: + break; + } + + if (str && scols_line_refer_data(line, i, str) != 0) + err_oom(); + } + + proc->outline = line; +} + +static struct libscols_table *init_scols_table(struct lsns *ls) +{ + struct libscols_table *tab; + size_t i; + + tab = scols_new_table(); + if (!tab) { + warn(_("failed to initialize output table")); + return NULL; + } + + scols_table_enable_raw(tab, ls->raw); + scols_table_enable_json(tab, ls->json); + scols_table_enable_noheadings(tab, ls->no_headings); + + if (ls->json) + scols_table_set_name(tab, "namespaces"); + + for (i = 0; i < ncolumns; i++) { + const struct colinfo *col = get_column_info(i); + int flags = col->flags; + struct libscols_column *cl; + + if (ls->no_trunc) + flags &= ~SCOLS_FL_TRUNC; + if (ls->tree && get_column_id(i) == COL_COMMAND) + flags |= SCOLS_FL_TREE; + if (ls->no_wrap) + flags &= ~SCOLS_FL_WRAP; + + cl = scols_table_new_column(tab, col->name, col->whint, flags); + if (cl == NULL) { + warnx(_("failed to initialize output column")); + goto err; + } + if (ls->json) + scols_column_set_json_type(cl, col->json_type); + + if (!ls->no_wrap && get_column_id(i) == COL_NSFS) { + scols_column_set_wrapfunc(cl, + scols_wrapnl_chunksize, + scols_wrapnl_nextchunk, + NULL); + scols_column_set_safechars(cl, "\n"); + } + } + + return tab; +err: + scols_unref_table(tab); + return NULL; +} + +static int show_namespaces(struct lsns *ls) +{ + struct libscols_table *tab; + struct list_head *p; + int rc = 0; + + tab = init_scols_table(ls); + if (!tab) + return -ENOMEM; + + list_for_each(p, &ls->namespaces) { + struct lsns_namespace *ns = list_entry(p, struct lsns_namespace, namespaces); + + if (ls->fltr_pid != 0 && !namespace_has_process(ns, ls->fltr_pid)) + continue; + + add_scols_line(ls, tab, ns, ns->proc); + } + + scols_print_table(tab); + scols_unref_table(tab); + return rc; +} + +static void show_process(struct lsns *ls, struct libscols_table *tab, + struct lsns_process *proc, struct lsns_namespace *ns) +{ + /* + * create a tree from parent->child relation, but only if the parent is + * within the same namespace + */ + if (ls->tree + && proc->parent + && !proc->parent->outline + && proc->parent->ns_ids[ns->type] == proc->ns_ids[ns->type]) + show_process(ls, tab, proc->parent, ns); + + add_scols_line(ls, tab, ns, proc); +} + + +static int show_namespace_processes(struct lsns *ls, struct lsns_namespace *ns) +{ + struct libscols_table *tab; + struct list_head *p; + + tab = init_scols_table(ls); + if (!tab) + return -ENOMEM; + + list_for_each(p, &ns->processes) { + struct lsns_process *proc = list_entry(p, struct lsns_process, ns_siblings[ns->type]); + + if (!proc->outline) + show_process(ls, tab, proc, ns); + } + + + scols_print_table(tab); + scols_unref_table(tab); + return 0; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + + fprintf(out, + _(" %s [options] [<namespace>]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("List system namespaces.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -J, --json use JSON output format\n"), out); + fputs(_(" -l, --list use list format output\n"), out); + fputs(_(" -n, --noheadings don't print headings\n"), out); + fputs(_(" -o, --output <list> define which output columns to use\n"), out); + fputs(_(" --output-all output all columns\n"), out); + fputs(_(" -p, --task <pid> print process namespaces\n"), out); + fputs(_(" -r, --raw use the raw output format\n"), out); + fputs(_(" -u, --notruncate don't truncate text in columns\n"), out); + fputs(_(" -W, --nowrap don't use multi-line representation\n"), out); + fputs(_(" -t, --type <name> namespace type (mnt, net, ipc, user, pid, uts, cgroup)\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(24)); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(infos); i++) + fprintf(out, " %11s %s\n", infos[i].name, _(infos[i].help)); + + printf(USAGE_MAN_TAIL("lsns(8)")); + + exit(EXIT_SUCCESS); +} + + +int main(int argc, char *argv[]) +{ + struct lsns ls; + int c; + int r = 0; + char *outarg = NULL; + enum { + OPT_OUTPUT_ALL = CHAR_MAX + 1 + }; + static const struct option long_opts[] = { + { "json", no_argument, NULL, 'J' }, + { "task", required_argument, NULL, 'p' }, + { "help", no_argument, NULL, 'h' }, + { "output", required_argument, NULL, 'o' }, + { "output-all", no_argument, NULL, OPT_OUTPUT_ALL }, + { "notruncate", no_argument, NULL, 'u' }, + { "version", no_argument, NULL, 'V' }, + { "noheadings", no_argument, NULL, 'n' }, + { "nowrap", no_argument, NULL, 'W' }, + { "list", no_argument, NULL, 'l' }, + { "raw", no_argument, NULL, 'r' }, + { "type", required_argument, NULL, 't' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'J','r' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + int is_net = 0; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + lsns_init_debug(); + memset(&ls, 0, sizeof(ls)); + + INIT_LIST_HEAD(&ls.processes); + INIT_LIST_HEAD(&ls.namespaces); + INIT_LIST_HEAD(&netnsids_cache); + + while ((c = getopt_long(argc, argv, + "Jlp:o:nruhVt:W", long_opts, NULL)) != -1) { + + err_exclusive_options(c, long_opts, excl, excl_st); + + switch(c) { + case 'J': + ls.json = 1; + break; + case 'l': + ls.list = 1; + break; + case 'o': + outarg = optarg; + break; + case OPT_OUTPUT_ALL: + for (ncolumns = 0; ncolumns < ARRAY_SIZE(infos); ncolumns++) + columns[ncolumns] = ncolumns; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'p': + ls.fltr_pid = strtos32_or_err(optarg, _("invalid PID argument")); + break; + case 'h': + usage(); + case 'n': + ls.no_headings = 1; + break; + case 'r': + ls.no_wrap = ls.raw = 1; + break; + case 'u': + ls.no_trunc = 1; + break; + case 't': + { + int type = ns_name2type(optarg); + if (type < 0) + errx(EXIT_FAILURE, _("unknown namespace type: %s"), optarg); + ls.fltr_types[type] = 1; + ls.fltr_ntypes++; + if (type == LSNS_ID_NET) + is_net = 1; + break; + } + case 'W': + ls.no_wrap = 1; + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (!ls.fltr_ntypes) { + size_t i; + + for (i = 0; i < ARRAY_SIZE(ns_names); i++) + ls.fltr_types[i] = 1; + } + + if (optind < argc) { + if (ls.fltr_pid) + errx(EXIT_FAILURE, _("--task is mutually exclusive with <namespace>")); + ls.fltr_ns = strtou64_or_err(argv[optind], _("invalid namespace argument")); + ls.tree = ls.list ? 0 : 1; + + if (!ncolumns) { + columns[ncolumns++] = COL_PID; + columns[ncolumns++] = COL_PPID; + columns[ncolumns++] = COL_USER; + columns[ncolumns++] = COL_COMMAND; + } + } + + if (!ncolumns) { + columns[ncolumns++] = COL_NS; + columns[ncolumns++] = COL_TYPE; + columns[ncolumns++] = COL_NPROCS; + columns[ncolumns++] = COL_PID; + columns[ncolumns++] = COL_USER; + if (is_net) { + columns[ncolumns++] = COL_NETNSID; + columns[ncolumns++] = COL_NSFS; + } + columns[ncolumns++] = COL_COMMAND; + } + + if (outarg && string_add_to_idarray(outarg, columns, ARRAY_SIZE(columns), + &ncolumns, column_name_to_id) < 0) + return EXIT_FAILURE; + + scols_init_debug(0); + + uid_cache = new_idcache(); + if (!uid_cache) + err(EXIT_FAILURE, _("failed to allocate UID cache")); + +#ifdef HAVE_LINUX_NET_NAMESPACE_H + if (has_column(COL_NETNSID)) + netlink_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); +#endif + if (has_column(COL_NSFS)) { + ls.tab = mnt_new_table_from_file(_PATH_PROC_MOUNTINFO); + if (!ls.tab) + err(MNT_EX_FAIL, _("failed to parse %s"), _PATH_PROC_MOUNTINFO); + } + + r = read_processes(&ls); + if (!r) + r = read_namespaces(&ls); + if (!r) { + if (ls.fltr_ns) { + struct lsns_namespace *ns = get_namespace(&ls, ls.fltr_ns); + + if (!ns) + errx(EXIT_FAILURE, _("not found namespace: %ju"), (uintmax_t) ls.fltr_ns); + r = show_namespace_processes(&ls, ns); + } else + r = show_namespaces(&ls); + } + + mnt_free_table(ls.tab); + if (netlink_fd >= 0) + close(netlink_fd); + free_idcache(uid_cache); + return r == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/sys-utils/mount.8 b/sys-utils/mount.8 new file mode 100644 index 0000000..da0ac5b --- /dev/null +++ b/sys-utils/mount.8 @@ -0,0 +1,2589 @@ +.\" Copyright (c) 1996-2004 Andries Brouwer +.\" Copyright (C) 2006-2012 Karel Zak <kzak@redhat.com> +.\" +.\" This page is somewhat derived from a page that was +.\" (c) 1980, 1989, 1991 The Regents of the University of California +.\" and had been heavily modified by Rik Faith and myself. +.\" (Probably no BSD text remains.) +.\" Fragments of text were written by Werner Almesberger, Remy Card, +.\" Stephen Tweedie and Eric Youngdale. +.\" +.\" This is free documentation; you can redistribute it and/or +.\" modify it under the terms of the GNU General Public License as +.\" published by the Free Software Foundation; either version 2 of +.\" the License, or (at your option) any later version. +.\" +.\" The GNU General Public License's references to "object code" +.\" and "executables" are to be interpreted as the output of any +.\" document formatting or typesetting system, including +.\" intermediate and printed output. +.\" +.\" This manual is distributed in the hope that it will be useful, +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +.\" GNU General Public License for more details. +.\" +.\" You should have received a copy of the GNU General Public License along +.\" with this program; if not, write to the Free Software Foundation, Inc., +.\" 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +.\" +.TH MOUNT 8 "August 2015" "util-linux" "System Administration" +.SH NAME +mount \- mount a filesystem +.SH SYNOPSIS +.B mount +.RB [ \-l | \-h | \-V ] +.LP +.B mount \-a +.RB [ \-fFnrsvw ] +.RB [ \-t +.IR fstype ] +.RB [ \-O +.IR optlist ] +.LP +.B mount +.RB [ \-fnrsvw ] +.RB [ \-o +.IR options ] +.IR device | dir +.LP +.B mount +.RB [ \-fnrsvw ] +.RB [ \-t +.IB fstype ] +.RB [ \-o +.IR options ] +.I device dir +.SH DESCRIPTION +All files accessible in a Unix system are arranged in one big +tree, the file hierarchy, rooted at +.IR / . +These files can be spread out over several devices. The +.B mount +command serves to attach the filesystem found on some device +to the big file tree. Conversely, the +.BR umount (8) +command will detach it again. The filesystem is used to control how data is +stored on the device or provided in a virtual way by network or another services. + +The standard form of the +.B mount +command is: +.RS + +.br +.BI "mount \-t" " type device dir" +.br + +.RE +This tells the kernel to attach the filesystem found on +.I device +(which is of type +.IR type ) +at the directory +.IR dir . +The option \fB\-t \fItype\fR is optional. The +.B mount +command is usually able to detect a filesystem. The root permissions are necessary +to mount a filesystem by default. See section "Non-superuser mounts" below for more details. +The previous contents (if any) and owner and mode of +.I dir +become invisible, and as long as this filesystem remains mounted, +the pathname +.I dir +refers to the root of the filesystem on +.IR device . + +If only the directory or the device is given, for example: +.RS +.sp +.B mount /dir +.sp +.RE +then \fBmount\fR looks for a mountpoint (and if not found then for a device) in the +.I /etc/fstab +file. It's possible to use the +.B \-\-target +or +.B \-\-source +options to avoid ambivalent interpretation of the given argument. For example: +.RS +.sp +.B mount \-\-target /mountpoint +.sp +.RE + +The same filesystem may be mounted more than once, and in some cases (e.g. +network filesystems) the same filesystem maybe be mounted on the same +mountpoint more times. The mount command does not implement any policy to +control this behavior. All behavior is controlled by kernel and it is usually +specific to filesystem driver. The exception is \fB\-\-all\fR, in this case +already mounted filesystems are ignored (see \fB\-\-all\fR below for more details). + +.SS Listing the mounts +The listing mode is maintained for backward compatibility only. + +For more robust and customizable output use +.BR findmnt (8), +\fBespecially in your scripts\fP. Note that control characters in the +mountpoint name are replaced with '?'. + +The following command lists all mounted filesystems (of type +.IR type ): +.RS +.sp +.BR "mount " [ \-l "] [" "\-t \fItype\/\fP" ] +.sp +.RE +The option \fB\-l\fR adds labels to this listing. See below. + +.SS Indicating the device and filesystem +Most devices are indicated by a filename (of a block special device), like +.IR /dev/sda1 , +but there are other possibilities. For example, in the case of an NFS mount, +.I device +may look like +.IR knuth.cwi.nl:/dir . +It is also possible to indicate a block special device using its filesystem label +or UUID (see the \fB\-L\fR and \fB\-U\fR options below), or its partition label +or UUID. Partition identifiers are supported for example for GUID Partition +Tables (GPT). + +The device name of disk partitions are unstable; hardware reconfiguration, +adding or removing a device can cause change in names. This is reason why it's +strongly recommended to use filesystem or partition identificators like UUID or +LABEL. + +The command \fBlsblk --fs\fR provides overview of filesystems, LABELs and UUIDs +on available block devices. The command \fBblkid -p <device>\fR provides details about +a filesystem on the specified device. + +Don't forget that there is no guarantee that UUIDs and labels are really +unique, especially if you move, share or copy the device. Use +.B "lsblk \-o +UUID,PARTUUID" +to verify that the UUIDs are really unique in your system. + +The recommended setup is to use tags (e.g.\& \fBUUID=\fIuuid\fR) rather than +.I /dev/disk/by-{label,uuid,partuuid,partlabel} +udev symlinks in the +.I /etc/fstab +file. Tags are +more readable, robust and portable. The +.BR mount (8) +command internally uses udev +symlinks, so the use of symlinks in /etc/fstab has no advantage over tags. +For more details see +.BR libblkid (3). + +Note that +.BR mount (8) +uses UUIDs as strings. The UUIDs from the command line or from +.BR fstab (5) +are not converted to internal binary representation. The string representation +of the UUID should be based on lower case characters. + +The +.I proc +filesystem is not associated with a special device, and when +mounting it, an arbitrary keyword, such as +.I proc +can be used instead of a device specification. +(The customary choice +.I none +is less fortunate: the error message `none already mounted' from +.B mount +can be confusing.) + +.SS The files /etc/fstab, /etc/mtab and /proc/mounts +The file +.I /etc/fstab +(see +.BR fstab (5)), +may contain lines describing what devices are usually +mounted where, using which options. The default location of the +.BR fstab (5) +file can be overridden with the +.BI \-\-fstab " path" +command-line option (see below for more details). +.LP +The command +.RS +.sp +.B mount \-a +.RB [ \-t +.IR type ] +.RB [ \-O +.IR optlist ] +.sp +.RE +(usually given in a bootscript) causes all filesystems mentioned in +.I fstab +(of the proper type and/or having or not having the proper options) +to be mounted as indicated, except for those whose line contains the +.B noauto +keyword. Adding the +.B \-F +option will make \fBmount\fR fork, so that the +filesystems are mounted simultaneously. +.LP +When mounting a filesystem mentioned in +.I fstab +or +.IR mtab , +it suffices to specify on the command line only the device, or only the mount point. +.sp +The programs +.B mount +and +.B umount +traditionally maintained a list of currently mounted filesystems in the file +.IR /etc/mtab . +The support for regular classic +.I /etc/mtab +is completely disabled in compile time by default, because on current Linux +systems it is better to make it a symlink to +.I /proc/mounts +instead. The regular mtab file maintained in userspace cannot reliably +work with namespaces, containers and other advanced Linux features. +If the regular mtab support is enabled than it's possible to +use the file as well as the symlink. +.sp +If no arguments are given to +.BR mount , +the list of mounted filesystems is printed. +.sp +If you want to override mount options from +.I /etc/fstab +you have to use the \fB\-o\fR option: +.RS +.sp +.BI mount " device" \fR| "dir " \-o " options" +.sp +.RE +and then the mount options from the command line will be appended to +the list of options from +.IR /etc/fstab . +This default behaviour is possible to change by command line +option \fB\-\-options\-mode\fR. +The usual behavior is that the last option wins if there are conflicting +ones. +.sp +The +.B mount +program does not read the +.I /etc/fstab +file if both +.I device +(or LABEL, UUID, PARTUUID or PARTLABEL) and +.I dir +are specified. For example, to mount device +.BR foo " at " /dir : +.RS +.sp +.B "mount /dev/foo /dir" +.sp +.RE +This default behaviour is possible to change by command line option +\fB\-\-options\-source\-force\fR to always read configuration from fstab. For +non-root users +.B mount +always read fstab configuration. + +.SS Non-superuser mounts +Normally, only the superuser can mount filesystems. +However, when +.I fstab +contains the +.B user +option on a line, anybody can mount the corresponding filesystem. +.LP +Thus, given a line +.RS +.sp +.B "/dev/cdrom /cd iso9660 ro,user,noauto,unhide" +.sp +.RE +any user can mount the iso9660 filesystem found on an inserted CDROM +using the command: +.RS +.B "mount /cd" +.sp +.RE +Note that \fBmount\fR is very strict about non-root users and all paths +specified on command line are verified before fstab is parsed or a helper +program is executed. It's strongly recommended to use a valid mountpoint to +specify filesystem, otherwise \fBmount\fR may fail. For example it's bad idea +to use NFS or CIFS source on command line. +.PP +For more details, see +.BR fstab (5). +Only the user that mounted a filesystem can unmount it again. +If any user should be able to unmount it, then use +.B users +instead of +.B user +in the +.I fstab +line. +The +.B owner +option is similar to the +.B user +option, with the restriction that the user must be the owner +of the special file. This may be useful e.g.\& for +.I /dev/fd +if a login script makes the console user owner of this device. +The +.B group +option is similar, with the restriction that the user must be +member of the group of the special file. + +.SS Bind mount operation +Remount part of the file hierarchy somewhere else. The call is: + +.RS +.br +.B mount \-\-bind +.I olddir newdir +.RE + +or by using this fstab entry: + +.RS +.br +.BI / olddir +.BI / newdir +.B none bind +.RE + +After this call the same contents are accessible in two places. + +It is important to understand that "bind" does not to create any second-class +or special node in the kernel VFS. The "bind" is just another operation to +attach a filesystem. There is nowhere stored information that the filesystem +has been attached by "bind" operation. The \fIolddir\fR and \fInewdir\fR are +independent and the \fIolddir\fR maybe be umounted. + +One can also remount a single file (on a single file). It's also +possible to use the bind mount to create a mountpoint from a regular +directory, for example: + +.RS +.br +.B mount \-\-bind foo foo +.RE + +The bind mount call attaches only (part of) a single filesystem, not possible +submounts. The entire file hierarchy including submounts is attached +a second place by using: + +.RS +.br +.B mount \-\-rbind +.I olddir newdir +.RE + +Note that the filesystem mount options maintained by kernel will remain the same as those +on the original mount point. The userspace mount options (e.g. _netdev) will not be copied +by +.BR mount (8) +and it's necessary explicitly specify the options on mount command line. + +.BR mount (8) +since v2.27 allows to change the mount options by passing the +relevant options along with +.BR \-\-bind . +For example: + +.RS +.br +.B mount -o bind,ro foo foo +.RE + +This feature is not supported by the Linux kernel; it is implemented in userspace +by an additional \fBmount\fR(2) remounting system call. +This solution is not atomic. + +The alternative (classic) way to create a read-only bind mount is to use the remount +operation, for example: + +.RS +.br +.B mount \-\-bind +.I olddir newdir +.br +.B mount \-o remount,bind,ro +.I olddir newdir +.RE + +Note that a read-only bind will create a read-only mountpoint (VFS entry), +but the original filesystem superblock will still be writable, meaning that the +.I olddir +will be writable, but the +.I newdir +will be read-only. + +It's also possible to change nosuid, nodev, noexec, noatime, nodiratime and +relatime VFS entry flags by "remount,bind" operation. The another (for example +filesystem specific flags) are silently ignored. It's impossible to change mount +options recursively (for example with \fB-o rbind,ro\fR). + +.BR mount (8) +since v2.31 ignores the \fBbind\fR flag from +.I /etc/fstab +on +.B remount operation +(if "-o remount" specified on command line). This is necessary to fully control +mount options on remount by command line. In the previous versions the bind +flag has been always applied and it was impossible to re-define mount options +without interaction with the bind semantic. This +.BR mount (8) +behavior does not affect situations when "remount,bind" is specified in the +.I /etc/fstab +file. +.RE + +.SS The move operation +Move a +.B mounted tree +to another place (atomically). The call is: + +.RS +.br +.B mount \-\-move +.I olddir newdir +.RE + +This will cause the contents which previously appeared under +.I olddir +to now be accessible under +.IR newdir . +The physical location of the files is not changed. +Note that +.I olddir +has to be a mountpoint. + +Note also that moving a mount residing under a shared mount is invalid and +unsupported. Use +.B findmnt \-o TARGET,PROPAGATION +to see the current propagation flags. + +.SS Shared subtree operations +Since Linux 2.6.15 it is possible to mark a mount and its submounts as shared, +private, slave or unbindable. A shared mount provides the ability to create mirrors +of that mount such that mounts and unmounts within any of the mirrors propagate +to the other mirror. A slave mount receives propagation from its master, but +not vice versa. A private mount carries no propagation abilities. An +unbindable mount is a private mount which cannot be cloned through a bind +operation. The detailed semantics are documented in +.I Documentation/filesystems/sharedsubtree.txt +file in the kernel source tree. + +Supported operations are: + +.RS +.nf +.BI "mount \-\-make\-shared " mountpoint +.BI "mount \-\-make\-slave " mountpoint +.BI "mount \-\-make\-private " mountpoint +.BI "mount \-\-make\-unbindable " mountpoint +.fi +.RE + +The following commands allow one to recursively change the type of all the +mounts under a given mountpoint. + +.RS +.nf +.BI "mount \-\-make\-rshared " mountpoint +.BI "mount \-\-make\-rslave " mountpoint +.BI "mount \-\-make\-rprivate " mountpoint +.BI "mount \-\-make\-runbindable " mountpoint +.fi +.RE + +.BR mount (8) +.B does not read +.BR fstab (5) +when a \fB\-\-make-\fR* operation is requested. All necessary information has to be +specified on the command line. + +Note that the Linux kernel does not allow to change multiple propagation flags +with a single +.BR mount (2) +system call, and the flags cannot be mixed with other mount options and operations. + +Since util-linux 2.23 the \fBmount\fR command allows to do more propagation +(topology) changes by one mount(8) call and do it also together with other +mount operations. This feature is EXPERIMENTAL. The propagation flags are applied +by additional \fBmount\fR(2) system calls when the preceding mount operations +were successful. Note that this use case is not atomic. It is possible to +specify the propagation flags in +.BR fstab (5) +as mount options +.RB ( private , +.BR slave , +.BR shared , +.BR unbindable , +.BR rprivate , +.BR rslave , +.BR rshared , +.BR runbindable ). + +For example: + +.RS +.nf +.B mount \-\-make\-private \-\-make\-unbindable /dev/sda1 /foo +.fi +.RE + +is the same as: + +.RS +.nf +.B mount /dev/sda1 /foox +.B mount \-\-make\-private /foo +.B mount \-\-make\-unbindable /foo +.fi +.RE + +.SH COMMAND-LINE OPTIONS +The full set of mount options used by an invocation of +.B mount +is determined by first extracting the +mount options for the filesystem from the +.I fstab +table, then applying any options specified by the +.B \-o +argument, and finally applying a +.BR \-r " or " \-w +option, when present. + +The command \fBmount\fR does not pass all command-line options to the +\fB/sbin/mount.\fIsuffix\fR mount helpers. The interface between \fBmount\fR +and the mount helpers is described below in the section \fBEXTERNAL HELPERS\fR. +.sp +Command-line options available for the +.B mount +command are: +.TP +.BR \-a , " \-\-all" +Mount all filesystems (of the given types) mentioned in +.I fstab +(except for those whose line contains the +.B noauto +keyword). The filesystems are mounted following their order in +.IR fstab . +The mount command compares filesystem source, target (and fs root for bind +mount or btrfs) to detect already mounted filesystems. The kernel table with +already mounted filesystems is cached during \fBmount \-\-all\fR. It means +that all duplicated fstab entries will be mounted. +.sp +Note that it is a bad practice to use \fBmount \-a\fR for +.I fstab +checking. The recommended solution is \fBfindmnt \-\-verify\fR. +.TP +.BR \-B , " \-\-bind" +Remount a subtree somewhere else (so that its contents are available +in both places). See above, under \fBBind mounts\fR. +.TP +.BR \-c , " \-\-no\-canonicalize" +Don't canonicalize paths. The mount command canonicalizes all paths +(from command line or fstab) by default. This option can be used +together with the +.B \-f +flag for already canonicalized absolute paths. The option is designed for mount +helpers which call \fBmount -i\fR. It is strongly recommended to not use this +command-line option for normal mount operations. +.sp +Note that \fBmount\fR(8) does not pass this option to the +\fB/sbin/mount.\fItype\fR helpers. +.TP +.BR \-F , " \-\-fork" +(Used in conjunction with +.BR \-a .) +Fork off a new incarnation of \fBmount\fR for each device. +This will do the mounts on different devices or different NFS servers +in parallel. +This has the advantage that it is faster; also NFS timeouts go in +parallel. A disadvantage is that the mounts are done in undefined order. +Thus, you cannot use this option if you want to mount both +.I /usr +and +.IR /usr/spool . +.IP "\fB\-f, \-\-fake\fP" +Causes everything to be done except for the actual system call; if it's not +obvious, this ``fakes'' mounting the filesystem. This option is useful in +conjunction with the +.B \-v +flag to determine what the +.B mount +command is trying to do. It can also be used to add entries for devices +that were mounted earlier with the \fB\-n\fR option. The \fB\-f\fR option +checks for an existing record in /etc/mtab and fails when the record already +exists (with a regular non-fake mount, this check is done by the kernel). +.IP "\fB\-i, \-\-internal\-only\fP" +Don't call the \fB/sbin/mount.\fIfilesystem\fR helper even if it exists. +.TP +.BR \-L , " \-\-label " \fIlabel +Mount the partition that has the specified +.IR label . +.TP +.BR \-l , " \-\-show\-labels" +Add the labels in the mount output. \fBmount\fR must have +permission to read the disk device (e.g.\& be set-user-ID root) for this to work. +One can set such a label for ext2, ext3 or ext4 using the +.BR e2label (8) +utility, or for XFS using +.BR xfs_admin (8), +or for reiserfs using +.BR reiserfstune (8). +.TP +.BR \-M , " \-\-move" +Move a subtree to some other place. See above, the subsection +\fBThe move operation\fR. +.TP +.BR \-n , " \-\-no\-mtab" +Mount without writing in +.IR /etc/mtab . +This is necessary for example when +.I /etc +is on a read-only filesystem. +.TP +.BR \-N , " \-\-namespace " \fIns +Perform mount in namespace specified by \fIns\fR. +\fIns\fR is either PID of process running in that namespace +or special file representing that namespace. +.sp +.BR mount (8) +switches to the namespace when it reads /etc/fstab, writes /etc/mtab (or writes to /run/mount) and calls +.BR mount (2) +system call, otherwise it runs in the original namespace. It means that the target namespace does not have +to contain any libraries or another requirements necessary to execute +.BR mount (2) +command. +.sp +See \fBnamespaces\fR(7) for more information. +.TP +.BR \-O , " \-\-test\-opts " \fIopts +Limit the set of filesystems to which the +.B \-a +option applies. In this regard it is like the +.B \-t +option except that +.B \-O +is useless without +.BR \-a . +For example, the command: +.RS +.RS +.sp +.B "mount \-a \-O no_netdev" +.sp +.RE +mounts all filesystems except those which have the option +.I _netdev +specified in the options field in the +.I /etc/fstab +file. + +It is different from +.B \-t +in that each option is matched exactly; a leading +.B no +at the beginning of one option does not negate the rest. + +The +.B \-t +and +.B \-O +options are cumulative in effect; that is, the command +.RS +.sp +.B "mount \-a \-t ext2 \-O _netdev" +.sp +.RE +mounts all ext2 filesystems with the _netdev option, not all filesystems +that are either ext2 or have the _netdev option specified. +.RE +.TP +.BR \-o , " \-\-options " \fIopts +Use the specified mount options. The \fIopts\fR argument is +a comma-separated list. For example: +.RS +.RS +.sp +.B "mount LABEL=mydisk \-o noatime,nodev,nosuid" +.sp +.RE + +For more details, see the +.B FILESYSTEM-INDEPENDENT MOUNT OPTIONS +and +.B FILESYSTEM-SPECIFIC MOUNT OPTIONS +sections. +.RE + +.TP +.BR "\-\-options\-mode " \fImode +Controls how to combine options from fstab/mtab with options from command line. +\fImode\fR can be one of +.BR ignore ", " append ", " prepend " or " replace . +For example \fBappend\fR means that options from fstab are appended to options from command line. +Default value is \fBprepend\fR -- it means command line options are evaluated after fstab options. +Note that the last option wins if there are conflicting ones. + +.TP +.BR "\-\-options\-source " \fIsource +Source of default options. +\fIsource\fR is comma separated list of +.BR fstab ", " mtab " and " disable . +\fBdisable\fR disables +.BR fstab " and " mtab +and disables \fB\-\-options\-source\-force\fR. +Default value is \fBfstab,mtab\fR. + +.TP +.B \-\-options\-source\-force +Use options from fstab/mtab even if both \fIdevice\fR and \fIdir\fR are specified. + +.TP +.BR \-R , " \-\-rbind" +Remount a subtree and all possible submounts somewhere else (so that its +contents are available in both places). See above, the subsection +\fBBind mounts\fR. +.TP +.BR \-r , " \-\-read\-only" +Mount the filesystem read-only. A synonym is +.BR "\-o ro" . +.sp +Note that, depending on the filesystem type, state and kernel behavior, the +system may still write to the device. For example, ext3 and ext4 will replay the +journal if the filesystem is dirty. To prevent this kind of write access, you +may want to mount an ext3 or ext4 filesystem with the \fBro,noload\fR mount +options or set the block device itself to read-only mode, see the +.BR blockdev (8) +command. +.TP +.B \-s +Tolerate sloppy mount options rather than failing. This will ignore mount +options not supported by a filesystem type. Not all filesystems support this +option. Currently it's supported by the \fBmount.nfs\fR mount helper only. +.TP +.BI \-\-source " device" +If only one argument for the mount command is given then the argument might be +interpreted as target (mountpoint) or source (device). This option allows to +explicitly define that the argument is the mount source. +.TP +.BI \-\-target " directory" +If only one argument for the mount command is given then the argument might be +interpreted as target (mountpoint) or source (device). This option allows to +explicitly define that the argument is the mount target. +.TP +.BR \-T , " \-\-fstab " \fIpath +Specifies an alternative fstab file. If \fIpath\fP is a directory then the files +in the directory are sorted by +.BR strverscmp (3); +files that start with "."\& or without an \&.fstab extension are ignored. The option +can be specified more than once. This option is mostly designed for initramfs +or chroot scripts where additional configuration is specified beyond standard +system configuration. +.sp +Note that \fBmount\fR(8) does not pass the option \fB\-\-fstab\fP to the +\fB/sbin/mount.\fItype\fR helpers, meaning that the alternative fstab files will be +invisible for the helpers. This is no problem for normal mounts, but user +(non-root) mounts always require fstab to verify the user's rights. +.TP +.BR \-t , " \-\-types " \fIfstype +The argument following the +.B \-t +is used to indicate the filesystem type. The filesystem types which are +currently supported depend on the running kernel. See +.I /proc/filesystems +and +.I /lib/modules/$(uname -r)/kernel/fs +for a complete list of the filesystems. The most common are ext2, ext3, ext4, +xfs, btrfs, vfat, sysfs, proc, nfs and cifs. +.sp +The programs +.B mount +and +.B umount +support filesystem subtypes. The subtype is defined by a '.subtype' suffix. For +example 'fuse.sshfs'. It's recommended to use subtype notation rather than add +any prefix to the mount source (for example 'sshfs#example.com' is +deprecated). + +If no +.B \-t +option is given, or if the +.B auto +type is specified, mount will try to guess the desired type. +Mount uses the blkid library for guessing the filesystem +type; if that does not turn up anything that looks familiar, +mount will try to read the file +.IR /etc/filesystems , +or, if that does not exist, +.IR /proc/filesystems . +All of the filesystem types listed there will be tried, +except for those that are labeled "nodev" (e.g.\& +.IR devpts , +.I proc +and +.IR nfs ). +If +.I /etc/filesystems +ends in a line with a single *, mount will read +.I /proc/filesystems +afterwards. While trying, all filesystem types will be +mounted with the mount option \fBsilent\fR. +.sp +The +.B auto +type may be useful for user-mounted floppies. +Creating a file +.I /etc/filesystems +can be useful to change the probe order (e.g., to try vfat before msdos +or ext3 before ext2) or if you use a kernel module autoloader. +.sp +More than one type may be specified in a comma-separated +list, for option +.B \-t +as well as in an +.I /etc/fstab +entry. The list of filesystem types for option +.B \-t +can be prefixed with +.B no +to specify the filesystem types on which no action should be taken. +The prefix +.B no +has no effect when specified in an +.I /etc/fstab +entry. +.sp +The prefix +.B no +can be meaningful with the +.B \-a +option. For example, the command +.RS +.RS +.sp +.B "mount \-a \-t nomsdos,smbfs" +.sp +.RE +mounts all filesystems except those of type +.I msdos +and +.IR smbfs . +.sp +For most types all the +.B mount +program has to do is issue a simple +.BR mount (2) +system call, and no detailed knowledge of the filesystem type is required. +For a few types however (like nfs, nfs4, cifs, smbfs, ncpfs) an ad hoc code is +necessary. The nfs, nfs4, cifs, smbfs, and ncpfs filesystems +have a separate mount program. In order to make it possible to +treat all types in a uniform way, \fBmount\fR will execute the program +.BI /sbin/mount. type +(if that exists) when called with type +.IR type . +Since different versions of the +.B smbmount +program have different calling conventions, +.B /sbin/mount.smbfs +may have to be a shell script that sets up the desired call. +.RE +.TP +.BR \-U , " \-\-uuid " \fIuuid +Mount the partition that has the specified +.IR uuid . +.TP +.BR \-v , " \-\-verbose" +Verbose mode. +.TP +.BR \-w , " \-\-rw" , " \-\-read\-write" +Mount the filesystem read/write. The read-write is kernel default. A synonym is +.BR "\-o rw" . + +Note that specify \fB\-w\fR on command line forces \fBmount\fR command +to never try read-only mount on write-protected devices. The default is +try read-only if the previous mount syscall with read-write flags failed. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. + +.SH FILESYSTEM-INDEPENDENT MOUNT OPTIONS +Some of these options are only useful when they appear in the +.I /etc/fstab +file. + +Some of these options could be enabled or disabled by default +in the system kernel. To check the current setting see the options +in /proc/mounts. Note that filesystems also have per-filesystem +specific default mount options (see for example \fBtune2fs \-l\fP +output for extN filesystems). + +The following options apply to any filesystem that is being +mounted (but not every filesystem actually honors them \(en e.g.\&, the +.B sync +option today has an effect only for ext2, ext3, ext4, fat, vfat and ufs): + +.TP +.B async +All I/O to the filesystem should be done asynchronously. (See also the +.B sync +option.) +.TP +.B atime +Do not use the \fBnoatime\fR feature, so the inode access time is controlled +by kernel defaults. See also the descriptions of the \fB\%relatime\fR and +.B strictatime +mount options. +.TP +.B noatime +Do not update inode access times on this filesystem (e.g.\& for faster +access on the news spool to speed up news servers). This works for all +inode types (directories too), so it implies \fB\%nodiratime\fR. +.TP +.B auto +Can be mounted with the +.B \-a +option. +.TP +.B noauto +Can only be mounted explicitly (i.e., the +.B \-a +option will not cause the filesystem to be mounted). +.TP +.na +.BR context=\fIcontext ", " fscontext=\fIcontext ", " defcontext=\fIcontext ", and " \%rootcontext=\fIcontext +.ad +The +.B context= +option is useful when mounting filesystems that do not support +extended attributes, such as a floppy or hard disk formatted with VFAT, or +systems that are not normally running under SELinux, such as an ext3 or ext4 formatted + +disk from a non-SELinux workstation. You can also use +.B context= +on filesystems you do not trust, such as a floppy. It also helps in compatibility with +xattr-supporting filesystems on earlier 2.4.<x> kernel versions. Even where +xattrs are supported, you can save time not having to label every file by +assigning the entire disk one security context. + +A commonly used option for removable media is +.BR \%context="system_u:object_r:removable_t" . + +Two other options are +.B fscontext= +and +.BR defcontext= , +both of which are mutually exclusive of the context option. This means you +can use fscontext and defcontext with each other, but neither can be used with +context. + +The +.B fscontext= +option works for all filesystems, regardless of their xattr +support. The fscontext option sets the overarching filesystem label to a +specific security context. This filesystem label is separate from the +individual labels on the files. It represents the entire filesystem for +certain kinds of permission checks, such as during mount or file creation. +Individual file labels are still obtained from the xattrs on the files +themselves. The context option actually sets the aggregate context that +fscontext provides, in addition to supplying the same label for individual +files. + +You can set the default security context for unlabeled files using +.B defcontext= +option. This overrides the value set for unlabeled files in the policy and requires a +filesystem that supports xattr labeling. + +The +.B rootcontext= +option allows you to explicitly label the root inode of a FS being mounted +before that FS or inode becomes visible to userspace. This was found to be +useful for things like stateless linux. + +Note that the kernel rejects any remount request that includes the context +option, \fBeven\fP when unchanged from the current context. + +.BR "Warning: the \fIcontext\fP value might contain commas" , +in which case the value has to be properly quoted, otherwise +.BR mount (8) +will interpret the comma as a separator between mount options. Don't forget that +the shell strips off quotes and thus +.BR "double quoting is required" . +For example: +.RS +.RS +.sp +.nf +.B mount \-t tmpfs none /mnt \-o \e +.B \ \ 'context="system_u:object_r:tmp_t:s0:c127,c456",noexec' +.fi +.sp +.RE +For more details, see +.BR selinux (8). +.RE + +.TP +.B defaults +Use the default options: +.BR rw ", " suid ", " dev ", " exec ", " auto ", " nouser ", and " async . + +Note that the real set of all default mount options depends on kernel +and filesystem type. See the beginning of this section for more details. +.TP +.B dev +Interpret character or block special devices on the filesystem. +.TP +.B nodev +Do not interpret character or block special devices on the file +system. +.TP +.B diratime +Update directory inode access times on this filesystem. This is the default. +(This option is ignored when \fBnoatime\fR is set.) +.TP +.B nodiratime +Do not update directory inode access times on this filesystem. +(This option is implied when \fBnoatime\fR is set.) +.TP +.B dirsync +All directory updates within the filesystem should be done synchronously. +This affects the following system calls: creat, link, unlink, symlink, +mkdir, rmdir, mknod and rename. +.TP +.B exec +Permit execution of binaries. +.TP +.B noexec +Do not permit direct execution of any binaries on the mounted filesystem. +.TP +.B group +Allow an ordinary user to mount the filesystem if one +of that user's groups matches the group of the device. +This option implies the options +.BR nosuid " and " nodev +(unless overridden by subsequent options, as in the option line +.BR group,dev,suid ). +.TP +.B iversion +Every time the inode is modified, the i_version field will be incremented. +.TP +.B noiversion +Do not increment the i_version inode field. +.TP +.B mand +Allow mandatory locks on this filesystem. See +.BR fcntl (2). +.TP +.B nomand +Do not allow mandatory locks on this filesystem. +.TP +.B _netdev +The filesystem resides on a device that requires network access +(used to prevent the system from attempting to mount these filesystems +until the network has been enabled on the system). +.TP +.B nofail +Do not report errors for this device if it does not exist. +.TP +.B relatime +Update inode access times relative to modify or change time. Access +time is only updated if the previous access time was earlier than the +current modify or change time. (Similar to \fB\%noatime\fR, but it doesn't +break \fBmutt\fR or other applications that need to know if a file has been +read since the last time it was modified.) + +Since Linux 2.6.30, the kernel defaults to the behavior provided by this +option (unless +.B \%noatime +was specified), and the +.B \%strictatime +option is required to obtain traditional semantics. In addition, since Linux +2.6.30, the file's last access time is always updated if it is more than 1 +day old. +.TP +.B norelatime +Do not use the +.B relatime +feature. See also the +.B strictatime +mount option. +.TP +.B strictatime +Allows to explicitly request full atime updates. This makes it +possible for the kernel to default to +.B \%relatime +or +.B \%noatime +but still allow userspace to override it. For more details about the default +system mount options see /proc/mounts. +.TP +.B nostrictatime +Use the kernel's default behavior for inode access time updates. +.TP +.B lazytime +Only update times (atime, mtime, ctime) on the in-memory version of the file inode. + +This mount option significantly reduces writes to the inode table for +workloads that perform frequent random writes to preallocated files. + +The on-disk timestamps are updated only when: +.sp +.RS +- the inode needs to be updated for some change unrelated to file timestamps +.sp +- the application employs +.BR fsync (2), +.BR syncfs (2), +or +.BR sync (2) +.sp +- an undeleted inode is evicted from memory +.sp +- more than 24 hours have passed since the i-node was written to disk. +.RE +.sp +.TP +.B nolazytime +Do not use the lazytime feature. +.TP +.B suid +Allow set-user-ID or set-group-ID bits to take +effect. +.TP +.B nosuid +Do not allow set-user-ID or set-group-ID bits to take +effect. +.TP +.B silent +Turn on the silent flag. +.TP +.B loud +Turn off the silent flag. +.TP +.B owner +Allow an ordinary user to mount the filesystem if that +user is the owner of the device. +This option implies the options +.BR nosuid " and " nodev +(unless overridden by subsequent options, as in the option line +.BR owner,dev,suid ). +.TP +.B remount +Attempt to remount an already-mounted filesystem. This is commonly +used to change the mount flags for a filesystem, especially to make a +readonly filesystem writable. It does not change device or mount point. + +The remount operation together with the +.B bind +flag has special semantic. See above, the subsection \fBBind mounts\fR. + +The remount functionality follows the standard way the mount command works +with options from fstab. This means that \fBmount\fR does not +read fstab (or mtab) only when both +.I device +and +.I dir +are specified. +.sp +.in +4 +.B "mount \-o remount,rw /dev/foo /dir" +.in +.sp +After this call all old mount options are replaced and arbitrary stuff from +fstab (or mtab) is ignored, except the loop= option which is internally +generated and maintained by the mount command. +.sp +.in +4 +.B "mount \-o remount,rw /dir" +.in +.sp +After this call, mount reads fstab and merges these options with +the options from the command line (\fB\-o\fR). +If no mountpoint is found in fstab, then a remount with unspecified source is +allowed. +.TP +.B ro +Mount the filesystem read-only. +.TP +.B rw +Mount the filesystem read-write. +.TP +.B sync +All I/O to the filesystem should be done synchronously. In the case of +media with a limited number of write cycles +(e.g.\& some flash drives), \fBsync\fR may cause life-cycle shortening. +.TP +.B user +Allow an ordinary user to mount the filesystem. +The name of the mounting user is written to the mtab file (or to the private +libmount file in /run/mount on systems without a regular mtab) so that this +same user can unmount the filesystem again. +This option implies the options +.BR noexec ", " nosuid ", and " nodev +(unless overridden by subsequent options, as in the option line +.BR user,exec,dev,suid ). +.TP +.B nouser +Forbid an ordinary user to mount the filesystem. +This is the default; it does not imply any other options. +.TP +.B users +Allow any user to mount and to unmount the filesystem, even +when some other ordinary user mounted it. +This option implies the options +.BR noexec ", " nosuid ", and " nodev +(unless overridden by subsequent options, as in the option line +.BR users,exec,dev,suid ). +.TP +.B X-* +All options prefixed with "X-" are interpreted as comments or as userspace +application-specific options. These options are not stored in the user space (e.g. mtab file), +nor sent to the mount.\fItype\fR helpers nor to the +.BR mount (2) +system call. The suggested format is \fBX-\fIappname\fR.\fIoption\fR. +.TP +.B x-* +The same as \fBX-*\fR options, but stored permanently in the user space. It +means the options are also available for umount or another operations. Note +that maintain mount options in user space is tricky, because it's necessary use +libmount based tools and there is no guarantee that the options will be always +available (for example after a move mount operation or in unshared namespace). + +Note that before util-linux v2.30 the x-* options have not been maintained by +libmount and stored in user space (functionality was the same as have X-* now), +but due to growing number of use-cases (in initrd, systemd etc.) the +functionality have been extended to keep existing fstab configurations usable +without a change. +.TP +.BR X-mount.mkdir [ = \fImode\fR ] +Allow to make a target directory (mountpoint). The optional argument +.I mode +specifies the filesystem access mode used for +.BR mkdir (2) +in octal notation. The default mode is 0755. This functionality is supported +only for root users. The option is also supported as x-mount.mkdir, this notation +is deprecated for mount.mkdir since v2.30. + +.SH "FILESYSTEM-SPECIFIC MOUNT OPTIONS" +You should consult the respective man page for the filesystem first. +If you want to know what options the ext4 filesystem supports, then check the +.BR ext4 (5) +man page. +If that doesn't exist, you can also check the corresponding mount page like +.BR mount.cifs (8). +Note that you might have to install the respective userland tools. +.sp +The following options apply only to certain filesystems. +We sort them by filesystem. They all follow the +.B \-o +flag. +.sp +What options are supported depends a bit on the running kernel. +More info may be found in the kernel source subdirectory +.IR Documentation/filesystems . + +.SS "Mount options for adfs" +.TP +\fBuid=\fP\,\fIvalue\fP and \fBgid=\fP\,\fIvalue\fP +Set the owner and group of the files in the filesystem (default: uid=gid=0). +.TP +\fBownmask=\fP\,\fIvalue\fP and \fBothmask=\fP\,\fIvalue\fP +Set the permission mask for ADFS 'owner' permissions and 'other' permissions, +respectively (default: 0700 and 0077, respectively). +See also +.IR /usr/src/linux/Documentation/filesystems/adfs.txt . + +.SS "Mount options for affs" +.TP +\fBuid=\fP\,\fIvalue\fP and \fBgid=\fP\,\fIvalue\fP +Set the owner and group of the root of the filesystem (default: uid=gid=0, +but with option +.B uid +or +.B gid +without specified value, the UID and GID of the current process are taken). +.TP +\fBsetuid=\fP\,\fIvalue\fP and \fBsetgid=\fP\,\fIvalue\fP +Set the owner and group of all files. +.TP +.BI mode= value +Set the mode of all files to +.IR value " & 0777" +disregarding the original permissions. +Add search permission to directories that have read permission. +The value is given in octal. +.TP +.B protect +Do not allow any changes to the protection bits on the filesystem. +.TP +.B usemp +Set UID and GID of the root of the filesystem to the UID and GID +of the mount point upon the first sync or umount, and then +clear this option. Strange... +.TP +.B verbose +Print an informational message for each successful mount. +.TP +.BI prefix= string +Prefix used before volume name, when following a link. +.TP +.BI volume= string +Prefix (of length at most 30) used before '/' when following a symbolic link. +.TP +.BI reserved= value +(Default: 2.) Number of unused blocks at the start of the device. +.TP +.BI root= value +Give explicitly the location of the root block. +.TP +.BI bs= value +Give blocksize. Allowed values are 512, 1024, 2048, 4096. +.TP +.BR grpquota | noquota | quota | usrquota +These options are accepted but ignored. +(However, quota utilities may react to such strings in +.IR /etc/fstab .) + +.SS "Mount options for debugfs" +The debugfs filesystem is a pseudo filesystem, traditionally mounted on +.IR /sys/kernel/debug . +.\" or just /debug +.\" present since 2.6.11 +As of kernel version 3.4, debugfs has the following options: +.TP +.BI uid= n ", gid=" n +Set the owner and group of the mountpoint. +.TP +.BI mode= value +Sets the mode of the mountpoint. + +.SS "Mount options for devpts" +The devpts filesystem is a pseudo filesystem, traditionally mounted on +.IR /dev/pts . +In order to acquire a pseudo terminal, a process opens +.IR /dev/ptmx ; +the number of the pseudo terminal is then made available to the process +and the pseudo terminal slave can be accessed as +.IR /dev/pts/ <number>. +.TP +\fBuid=\fP\,\fIvalue\fP and \fBgid=\fP\,\fIvalue\fP +This sets the owner or the group of newly created PTYs to +the specified values. When nothing is specified, they will +be set to the UID and GID of the creating process. +For example, if there is a tty group with GID 5, then +.B gid=5 +will cause newly created PTYs to belong to the tty group. +.TP +.BI mode= value +Set the mode of newly created PTYs to the specified value. +The default is 0600. +A value of +.B mode=620 +and +.B gid=5 +makes "mesg y" the default on newly created PTYs. +.TP +\fBnewinstance +Create a private instance of devpts filesystem, such that +indices of ptys allocated in this new instance are +independent of indices created in other instances of devpts. + +All mounts of devpts without this +.B newinstance +option share the same set of pty indices (i.e. legacy mode). +Each mount of devpts with the +.B newinstance +option has a private set of pty indices. + +This option is mainly used to support containers in the +linux kernel. It is implemented in linux kernel versions +starting with 2.6.29. Further, this mount option is valid +only if CONFIG_DEVPTS_MULTIPLE_INSTANCES is enabled in the +kernel configuration. + +To use this option effectively, +.I /dev/ptmx +must be a symbolic link to +.I pts/ptmx. +See +.I Documentation/filesystems/devpts.txt +in the linux kernel source tree for details. +.TP +.BI ptmxmode= value + +Set the mode for the new +.I ptmx +device node in the devpts filesystem. + +With the support for multiple instances of devpts (see +.B newinstance +option above), each instance has a private +.I ptmx +node in the root of the devpts filesystem (typically +.IR /dev/pts/ptmx ). + +For compatibility with older versions of the kernel, the +default mode of the new +.I ptmx +node is 0000. +.BI ptmxmode= value +specifies a more useful mode for the +.I ptmx +node and is highly recommended when the +.B newinstance +option is specified. + +This option is only implemented in linux kernel versions +starting with 2.6.29. Further, this option is valid only if +CONFIG_DEVPTS_MULTIPLE_INSTANCES is enabled in the kernel +configuration. + +.SS "Mount options for fat" +(Note: +.I fat +is not a separate filesystem, but a common part of the +.IR msdos , +.I umsdos +and +.I vfat +filesystems.) +.TP +.BR blocksize= { 512 | 1024 | 2048 } +Set blocksize (default 512). This option is obsolete. +.TP +\fBuid=\fP\,\fIvalue\fP and \fBgid=\fP\,\fIvalue\fP +Set the owner and group of all files. +(Default: the UID and GID of the current process.) +.TP +.BI umask= value +Set the umask (the bitmask of the permissions that are +.B not +present). The default is the umask of the current process. +The value is given in octal. +.TP +.BI dmask= value +Set the umask applied to directories only. +The default is the umask of the current process. +The value is given in octal. +.TP +.BI fmask= value +Set the umask applied to regular files only. +The default is the umask of the current process. +The value is given in octal. +.TP +.BI allow_utime= value +This option controls the permission check of mtime/atime. +.RS +.TP +.B 20 +If current process is in group of file's group ID, you can change timestamp. +.TP +.B 2 +Other users can change timestamp. +.PP +The default is set from `dmask' option. (If the directory is writable, +.BR utime (2) +is also allowed. I.e.\& \s+3~\s0dmask & 022) + +Normally +.BR utime (2) +checks current process is owner of the file, or it has +CAP_FOWNER capability. But FAT filesystem doesn't have UID/GID on disk, so +normal check is too inflexible. With this option you can relax it. +.RE +.TP +.BI check= value +Three different levels of pickiness can be chosen: +.RS +.TP +.BR r [ elaxed ] +Upper and lower case are accepted and equivalent, long name parts are +truncated (e.g.\& +.I verylongname.foobar +becomes +.IR verylong.foo ), +leading and embedded spaces are accepted in each name part (name and extension). +.TP +.BR n [ ormal ] +Like "relaxed", but many special characters (*, ?, <, spaces, etc.) are +rejected. This is the default. +.TP +.BR s [ trict ] +Like "normal", but names that contain long parts or special characters +that are sometimes used on Linux but are not accepted by MS-DOS +(+, =, etc.) are rejected. +.RE +.TP +.BI codepage= value +Sets the codepage for converting to shortname characters on FAT +and VFAT filesystems. By default, codepage 437 is used. +.TP +.BI conv= mode +This option is obsolete and may fail or being ignored. +.TP +.BI cvf_format= module +Forces the driver to use the CVF (Compressed Volume File) module +.RI cvf_ module +instead of auto-detection. If the kernel supports kmod, the +cvf_format=xxx option also controls on-demand CVF module loading. +This option is obsolete. +.TP +.BI cvf_option= option +Option passed to the CVF module. This option is obsolete. +.TP +.B debug +Turn on the +.I debug +flag. A version string and a list of filesystem parameters will be +printed (these data are also printed if the parameters appear to be +inconsistent). +.TP +.B discard +If set, causes discard/TRIM commands to be issued to the block device +when blocks are freed. This is useful for SSD devices and +sparse/thinly-provisioned LUNs. +.TP +.B dos1xfloppy +If set, use a fallback default BIOS Parameter Block configuration, determined +by backing device size. These static parameters match defaults assumed by DOS +1.x for 160 kiB, 180 kiB, 320 kiB, and 360 kiB floppies and floppy images. +.TP +.BR errors= { panic | continue | remount-ro } +Specify FAT behavior on critical errors: panic, continue without doing +anything, or remount the partition in read-only mode (default behavior). +.TP +.BR fat= { 12 | 16 | 32 } +Specify a 12, 16 or 32 bit fat. This overrides +the automatic FAT type detection routine. Use with caution! +.TP +.BI iocharset= value +Character set to use for converting between 8 bit characters +and 16 bit Unicode characters. The default is iso8859-1. +Long filenames are stored on disk in Unicode format. +.TP +.BR nfs= { stale_rw | nostale_ro } +Enable this only if you want to export the FAT filesystem over NFS. + +.BR stale_rw : +This option maintains an index (cache) of directory inodes which is used by the +nfs-related code to improve look-ups. Full file operations (read/write) over +NFS are supported but with cache eviction at NFS server, this could result in +spurious +.B ESTALE +errors. + +.BR nostale_ro : +This option bases the inode number and file handle +on the on-disk location of a file in the FAT directory entry. +This ensures that +.B ESTALE +will not be returned after a file is +evicted from the inode cache. However, it means that operations +such as rename, create and unlink could cause file handles that +previously pointed at one file to point at a different file, +potentially causing data corruption. For this reason, this +option also mounts the filesystem readonly. + +To maintain backward compatibility, '-o nfs' is also accepted, +defaulting to +.BR stale_rw . +.TP +.B tz=UTC +This option disables the conversion of timestamps +between local time (as used by Windows on FAT) and UTC +(which Linux uses internally). This is particularly +useful when mounting devices (like digital cameras) +that are set to UTC in order to avoid the pitfalls of +local time. +.TP +.BI time_offset= minutes +Set offset for conversion of timestamps from local time used by FAT to UTC. +I.e., +.I minutes +will be subtracted from each timestamp to convert it to UTC used +internally by Linux. This is useful when the time zone set in the kernel via +.BR settimeofday (2) +is not the time zone used by the filesystem. Note +that this option still does not provide correct time stamps in all cases in +presence of DST - time stamps in a different DST setting will be off by one +hour. +.TP +.B quiet +Turn on the +.I quiet +flag. Attempts to chown or chmod files do not return errors, +although they fail. Use with caution! +.TP +.B rodir +FAT has the ATTR_RO (read-only) attribute. On Windows, the ATTR_RO of the +directory will just be ignored, and is used only by applications as a flag +(e.g.\& it's set for the customized folder). + +If you want to use ATTR_RO as read-only flag even for the directory, set this +option. +.TP +.B showexec +If set, the execute permission bits of the file will be allowed only if +the extension part of the name is \&.EXE, \&.COM, or \&.BAT. Not set by default. +.TP +.B sys_immutable +If set, ATTR_SYS attribute on FAT is handled as IMMUTABLE flag on Linux. +Not set by default. +.TP +.B flush +If set, the filesystem will try to flush to disk more early than normal. +Not set by default. +.TP +.B usefree +Use the "free clusters" value stored on FSINFO. It'll +be used to determine number of free clusters without +scanning disk. But it's not used by default, because +recent Windows don't update it correctly in some +case. If you are sure the "free clusters" on FSINFO is +correct, by this option you can avoid scanning disk. +.TP +.BR dots ", " nodots ", " dotsOK= [ yes | no ] +Various misguided attempts to force Unix or DOS conventions +onto a FAT filesystem. + +.SS "Mount options for hfs" +.TP +.BI creator= cccc ", type=" cccc +Set the creator/type values as shown by the MacOS finder +used for creating new files. Default values: '????'. +.TP +.BI uid= n ", gid=" n +Set the owner and group of all files. +(Default: the UID and GID of the current process.) +.TP +.BI dir_umask= n ", file_umask=" n ", umask=" n +Set the umask used for all directories, all regular files, or all +files and directories. Defaults to the umask of the current process. +.TP +.BI session= n +Select the CDROM session to mount. +Defaults to leaving that decision to the CDROM driver. +This option will fail with anything but a CDROM as underlying device. +.TP +.BI part= n +Select partition number n from the device. +Only makes sense for CDROMs. +Defaults to not parsing the partition table at all. +.TP +.B quiet +Don't complain about invalid mount options. + +.SS "Mount options for hpfs" +.TP +\fBuid=\fP\,\fIvalue\fP and \fBgid=\fP\,\fIvalue\fP +Set the owner and group of all files. (Default: the UID and GID +of the current process.) +.TP +.BI umask= value +Set the umask (the bitmask of the permissions that are +.B not +present). The default is the umask of the current process. +The value is given in octal. +.TP +.BR case= { lower | asis } +Convert all files names to lower case, or leave them. +(Default: +.BR case=lower .) +.TP +.BI conv= mode +This option is obsolete and may fail or being ignored. +.TP +.B nocheck +Do not abort mounting when certain consistency checks fail. + +.SS "Mount options for iso9660" +ISO 9660 is a standard describing a filesystem structure to be used +on CD-ROMs. (This filesystem type is also seen on some DVDs. See also the +.I udf +filesystem.) + +Normal +.I iso9660 +filenames appear in an 8.3 format (i.e., DOS-like restrictions on filename +length), and in addition all characters are in upper case. Also there is +no field for file ownership, protection, number of links, provision for +block/character devices, etc. + +Rock Ridge is an extension to iso9660 that provides all of these UNIX-like +features. Basically there are extensions to each directory record that +supply all of the additional information, and when Rock Ridge is in use, +the filesystem is indistinguishable from a normal UNIX filesystem (except +that it is read-only, of course). +.TP +.B norock +Disable the use of Rock Ridge extensions, even if available. Cf.\& +.BR map . +.TP +.B nojoliet +Disable the use of Microsoft Joliet extensions, even if available. Cf.\& +.BR map . +.TP +.BR check= { r [ elaxed ]| s [ trict ]} +With +.BR check=relaxed , +a filename is first converted to lower case before doing the lookup. +This is probably only meaningful together with +.B norock +and +.BR map=normal . +(Default: +.BR check=strict .) +.TP +\fBuid=\fP\,\fIvalue\fP and \fBgid=\fP\,\fIvalue\fP +Give all files in the filesystem the indicated user or group id, +possibly overriding the information found in the Rock Ridge extensions. +(Default: +.BR uid=0,gid=0 .) +.TP +.BR map= { n [ ormal ]| o [ ff ]| a [ corn ]} +For non-Rock Ridge volumes, normal name translation maps upper +to lower case ASCII, drops a trailing `;1', and converts `;' to `.'. +With +.B map=off +no name translation is done. See +.BR norock . +(Default: +.BR map=normal .) +.B map=acorn +is like +.B map=normal +but also apply Acorn extensions if present. +.TP +.BI mode= value +For non-Rock Ridge volumes, give all files the indicated mode. +(Default: read and execute permission for everybody.) +Octal mode values require a leading 0. +.TP +.B unhide +Also show hidden and associated files. +(If the ordinary files and the associated or hidden files have +the same filenames, this may make the ordinary files inaccessible.) +.TP +.BR block= { 512 | 1024 | 2048 } +Set the block size to the indicated value. +(Default: +.BR block=1024 .) +.TP +.BI conv= mode +This option is obsolete and may fail or being ignored. +.TP +.B cruft +If the high byte of the file length contains other garbage, +set this mount option to ignore the high order bits of the file length. +This implies that a file cannot be larger than 16\ MB. +.TP +.BI session= x +Select number of session on multisession CD. +.TP +.BI sbsector= xxx +Session begins from sector xxx. +.LP +The following options are the same as for vfat and specifying them only makes +sense when using discs encoded using Microsoft's Joliet extensions. +.TP +.BI iocharset= value +Character set to use for converting 16 bit Unicode characters on CD +to 8 bit characters. The default is iso8859-1. +.TP +.B utf8 +Convert 16 bit Unicode characters on CD to UTF-8. + +.SS "Mount options for jfs" +.TP +.BI iocharset= name +Character set to use for converting from Unicode to ASCII. The default is +to do no conversion. Use +.B iocharset=utf8 +for UTF8 translations. This requires CONFIG_NLS_UTF8 to be set in +the kernel +.I ".config" +file. +.TP +.BI resize= value +Resize the volume to +.I value +blocks. JFS only supports growing a volume, not shrinking it. This option +is only valid during a remount, when the volume is mounted read-write. The +.B resize +keyword with no value will grow the volume to the full size of the partition. +.TP +.B nointegrity +Do not write to the journal. The primary use of this option is to allow +for higher performance when restoring a volume from backup media. The +integrity of the volume is not guaranteed if the system abnormally ends. +.TP +.B integrity +Default. Commit metadata changes to the journal. Use this option to remount +a volume where the +.B nointegrity +option was previously specified in order to restore normal behavior. +.TP +.BR errors= { continue | remount-ro | panic } +Define the behavior when an error is encountered. +(Either ignore errors and just mark the filesystem erroneous and continue, +or remount the filesystem read-only, or panic and halt the system.) +.TP +.BR noquota | quota | usrquota | grpquota +These options are accepted but ignored. + +.SS "Mount options for msdos" +See mount options for fat. +If the +.I msdos +filesystem detects an inconsistency, it reports an error and sets the file +system read-only. The filesystem can be made writable again by remounting +it. + +.SS "Mount options for ncpfs" +Just like +.IR nfs ", the " ncpfs +implementation expects a binary argument (a +.IR "struct ncp_mount_data" ) +to the mount system call. This argument is constructed by +.BR ncpmount (8) +and the current version of +.B mount +(2.12) does not know anything about ncpfs. + +.SS "Mount options for ntfs" +.TP +.BI iocharset= name +Character set to use when returning file names. +Unlike VFAT, NTFS suppresses names that contain +nonconvertible characters. Deprecated. +.TP +.BI nls= name +New name for the option earlier called +.IR iocharset . +.TP +.B utf8 +Use UTF-8 for converting file names. +.TP +.BR uni_xlate= { 0 | 1 | 2 } +For 0 (or `no' or `false'), do not use escape sequences +for unknown Unicode characters. +For 1 (or `yes' or `true') or 2, use vfat-style 4-byte escape sequences +starting with ":". Here 2 give a little-endian encoding +and 1 a byteswapped bigendian encoding. +.TP +.B posix=[0|1] +If enabled (posix=1), the filesystem distinguishes between +upper and lower case. The 8.3 alias names are presented as +hard links instead of being suppressed. This option is obsolete. +.TP +\fBuid=\fP\,\fIvalue\fP, \fBgid=\fP\,\fIvalue\fP and \fBumask=\fP\,\fIvalue\fP +Set the file permission on the filesystem. +The umask value is given in octal. +By default, the files are owned by root and not readable by somebody else. + +.SS "Mount options for overlay" +Since Linux 3.18 the overlay pseudo filesystem implements a union mount for +other filesystems. + +An overlay filesystem combines two filesystems - an \fBupper\fR filesystem and +a \fBlower\fR filesystem. When a name exists in both filesystems, the object +in the upper filesystem is visible while the object in the lower filesystem is +either hidden or, in the case of directories, merged with the upper object. + +The lower filesystem can be any filesystem supported by Linux and does not need +to be writable. The lower filesystem can even be another overlayfs. The upper +filesystem will normally be writable and if it is it must support the creation +of trusted.* extended attributes, and must provide a valid d_type in readdir +responses, so NFS is not suitable. + +A read-only overlay of two read-only filesystems may use any filesystem type. +The options \fBlowerdir\fR and \fBupperdir\fR are combined into a merged +directory by using: + +.RS +.br +.nf +.B "mount \-t overlay overlay \e" +.B " \-olowerdir=/lower,upperdir=/upper,workdir=/work /merged" +.fi +.br +.RE + +.TP +.BI lowerdir= directory +Any filesystem, does not need to be on a writable filesystem. +.TP +.BI upperdir= directory +The upperdir is normally on a writable filesystem. +.TP +.BI workdir= directory +The workdir needs to be an empty directory on the same filesystem as upperdir. + +.SS "Mount options for reiserfs" +Reiserfs is a journaling filesystem. +.TP +.B conv +Instructs version 3.6 reiserfs software to mount a version 3.5 filesystem, +using the 3.6 format for newly created objects. This filesystem will no +longer be compatible with reiserfs 3.5 tools. +.TP +.BR hash= { rupasov | tea | r5 | detect } +Choose which hash function reiserfs will use to find files within directories. +.RS +.TP +.B rupasov +A hash invented by Yury Yu.\& Rupasov. It is fast and preserves locality, +mapping lexicographically close file names to close hash values. +This option should not be used, as it causes a high probability of hash +collisions. +.TP +.B tea +A Davis-Meyer function implemented by Jeremy Fitzhardinge. +It uses hash permuting bits in the name. It gets high randomness +and, therefore, low probability of hash collisions at some CPU cost. +This may be used if EHASHCOLLISION errors are experienced with the r5 hash. +.TP +.B r5 +A modified version of the rupasov hash. It is used by default and is +the best choice unless the filesystem has huge directories and +unusual file-name patterns. +.TP +.B detect +Instructs +.I mount +to detect which hash function is in use by examining +the filesystem being mounted, and to write this information into +the reiserfs superblock. This is only useful on the first mount of +an old format filesystem. +.RE +.TP +.B hashed_relocation +Tunes the block allocator. This may provide performance improvements +in some situations. +.TP +.B no_unhashed_relocation +Tunes the block allocator. This may provide performance improvements +in some situations. +.TP +.B noborder +Disable the border allocator algorithm invented by Yury Yu.\& Rupasov. +This may provide performance improvements in some situations. +.TP +.B nolog +Disable journaling. This will provide slight performance improvements in +some situations at the cost of losing reiserfs's fast recovery from crashes. +Even with this option turned on, reiserfs still performs all journaling +operations, save for actual writes into its journaling area. Implementation +of +.I nolog +is a work in progress. +.TP +.B notail +By default, reiserfs stores small files and `file tails' directly into its +tree. This confuses some utilities such as +.BR LILO (8). +This option is used to disable packing of files into the tree. +.TP +.B replayonly +Replay the transactions which are in the journal, but do not actually +mount the filesystem. Mainly used by +.IR reiserfsck . +.TP +.BI resize= number +A remount option which permits online expansion of reiserfs partitions. +Instructs reiserfs to assume that the device has +.I number +blocks. +This option is designed for use with devices which are under logical +volume management (LVM). +There is a special +.I resizer +utility which can be obtained from +.IR ftp://ftp.namesys.com/pub/reiserfsprogs . +.TP +.B user_xattr +Enable Extended User Attributes. See the +.BR attr (5) +manual page. +.TP +.B acl +Enable POSIX Access Control Lists. See the +.BR acl (5) +manual page. +.TP +.BR barrier=none " / " barrier=flush " +This disables / enables the use of write barriers in the journaling code. +barrier=none disables, barrier=flush enables (default). This also requires an +IO stack which can support barriers, and if reiserfs gets an error on a barrier +write, it will disable barriers again with a warning. Write barriers enforce +proper on-disk ordering of journal commits, making volatile disk write caches +safe to use, at some performance penalty. If your disks are battery-backed in +one way or another, disabling barriers may safely improve performance. + +.SS "Mount options for ubifs" +UBIFS is a flash filesystem which works on top of UBI volumes. Note that +\fBatime\fR is not supported and is always turned off. +.TP +The device name may be specified as +.RS +.B ubiX_Y +UBI device number +.BR X , +volume number +.B Y +.TP +.B ubiY +UBI device number +.BR 0 , +volume number +.B Y +.TP +.B ubiX:NAME +UBI device number +.BR X , +volume with name +.B NAME +.TP +.B ubi:NAME +UBI device number +.BR 0 , +volume with name +.B NAME +.RE +Alternative +.B ! +separator may be used instead of +.BR : . +.TP +The following mount options are available: +.TP +.B bulk_read +Enable bulk-read. VFS read-ahead is disabled because it slows down the file +system. Bulk-Read is an internal optimization. Some flashes may read faster if +the data are read at one go, rather than at several read requests. For +example, OneNAND can do "read-while-load" if it reads more than one NAND page. +.TP +.B no_bulk_read +Do not bulk-read. This is the default. +.TP +.B chk_data_crc +Check data CRC-32 checksums. This is the default. +.TP +.BR no_chk_data_crc . +Do not check data CRC-32 checksums. With this option, the filesystem does not +check CRC-32 checksum for data, but it does check it for the internal indexing +information. This option only affects reading, not writing. CRC-32 is always +calculated when writing the data. +.TP +.BR compr= { none | lzo | zlib } +Select the default compressor which is used when new files are written. It is +still possible to read compressed files if mounted with the +.B none +option. + +.SS "Mount options for udf" +UDF is the "Universal Disk Format" filesystem defined by OSTA, the Optical +Storage Technology Association, and is often used for DVD-ROM, frequently +in the form of a hybrid UDF/ISO-9660 filesystem. It is, however, +perfectly usable by itself on disk drives, flash drives and other block devices. +See also +.IR iso9660 . +.TP +.B uid= +Make all files in the filesystem belong to the given user. +uid=forget can be specified independently of (or usually in +addition to) uid=<user> and results in UDF +not storing uids to the media. In fact the recorded uid +is the 32-bit overflow uid -1 as defined by the UDF standard. +The value is given as either <user> which is a valid user name or the corresponding +decimal user id, or the special string "forget". +.TP +.B gid= +Make all files in the filesystem belong to the given group. +gid=forget can be specified independently of (or usually in +addition to) gid=<group> and results in UDF +not storing gids to the media. In fact the recorded gid +is the 32-bit overflow gid -1 as defined by the UDF standard. +The value is given as either <group> which is a valid group name or the corresponding +decimal group id, or the special string "forget". +.TP +.B umask= +Mask out the given permissions from all inodes read from the filesystem. +The value is given in octal. +.TP +.B mode= +If mode= is set the permissions of all non-directory inodes read from the +filesystem will be set to the given mode. The value is given in octal. +.TP +.B dmode= +If dmode= is set the permissions of all directory inodes read from the +filesystem will be set to the given dmode. The value is given in octal. +.TP +.B bs= +Set the block size. Default value prior to kernel version 2.6.30 was +2048. Since 2.6.30 and prior to 4.11 it was logical device block size with +fallback to 2048. Since 4.11 it is logical block size with fallback to +any valid block size between logical device block size and 4096. + +For other details see the \fBmkudffs\fP(8) 2.0+ manpage, sections +\fBCOMPATIBILITY\fP and \fBBLOCK SIZE\fP. +.TP +.B unhide +Show otherwise hidden files. +.TP +.B undelete +Show deleted files in lists. +.TP +.B adinicb +Embed data in the inode. (default) +.TP +.B noadinicb +Don't embed data in the inode. +.TP +.B shortad +Use short UDF address descriptors. +.TP +.B longad +Use long UDF address descriptors. (default) +.TP +.B nostrict +Unset strict conformance. +.TP +.B iocharset= +Set the NLS character set. This requires kernel compiled with CONFIG_UDF_NLS option. +.TP +.B utf8 +Set the UTF-8 character set. +.SS Mount options for debugging and disaster recovery +.TP +.B novrs +Ignore the Volume Recognition Sequence and attempt to mount anyway. +.TP +.B session= +Select the session number for multi-session recorded optical media. (default= last session) +.TP +.B anchor= +Override standard anchor location. (default= 256) +.TP +.B lastblock= +Set the last block of the filesystem. +.SS Unused historical mount options that may be encountered and should be removed +.TP +.B uid=ignore +Ignored, use uid=<user> instead. +.TP +.B gid=ignore +Ignored, use gid=<group> instead. +.TP +.B volume= +Unimplemented and ignored. +.TP +.B partition= +Unimplemented and ignored. +.TP +.B fileset= +Unimplemented and ignored. +.TP +.B rootdir= +Unimplemented and ignored. + +.SS "Mount options for ufs" +.TP +.BI ufstype= value +UFS is a filesystem widely used in different operating systems. +The problem are differences among implementations. Features of some +implementations are undocumented, so its hard to recognize the +type of ufs automatically. +That's why the user must specify the type of ufs by mount option. +Possible values are: +.RS +.TP +.B old +Old format of ufs, this is the default, read only. +(Don't forget to give the \-r option.) +.TP +.B 44bsd +For filesystems created by a BSD-like system (NetBSD, FreeBSD, OpenBSD). +.TP +.B ufs2 +Used in FreeBSD 5.x supported as read-write. +.TP +.B 5xbsd +Synonym for ufs2. +.TP +.B sun +For filesystems created by SunOS or Solaris on Sparc. +.TP +.B sunx86 +For filesystems created by Solaris on x86. +.TP +.B hp +For filesystems created by HP-UX, read-only. +.TP +.B nextstep +For filesystems created by NeXTStep (on NeXT station) (currently read only). +.TP +.B nextstep-cd +For NextStep CDROMs (block_size == 2048), read-only. +.TP +.B openstep +For filesystems created by OpenStep (currently read only). +The same filesystem type is also used by Mac OS X. +.RE + +.TP +.BI onerror= value +Set behavior on error: +.RS +.TP +.B panic +If an error is encountered, cause a kernel panic. +.TP +.RB [ lock | umount | repair ] +These mount options don't do anything at present; +when an error is encountered only a console message is printed. +.RE + +.SS "Mount options for umsdos" +See mount options for msdos. +The +.B dotsOK +option is explicitly killed by +.IR umsdos . + +.SS "Mount options for vfat" +First of all, the mount options for +.I fat +are recognized. +The +.B dotsOK +option is explicitly killed by +.IR vfat . +Furthermore, there are +.TP +.B uni_xlate +Translate unhandled Unicode characters to special escaped sequences. +This lets you backup and restore filenames that are created with any +Unicode characters. Without this option, a '?' is used when no +translation is possible. The escape character is ':' because it is +otherwise invalid on the vfat filesystem. The escape sequence +that gets used, where u is the Unicode character, +is: ':', (u & 0x3f), ((u>>6) & 0x3f), (u>>12). +.TP +.B posix +Allow two files with names that only differ in case. +This option is obsolete. +.TP +.B nonumtail +First try to make a short name without sequence number, +before trying +.IR name\s+3~\s0num.ext . +.TP +.B utf8 +UTF8 is the filesystem safe 8-bit encoding of Unicode that is used by the +console. It can be enabled for the filesystem with this option or disabled +with utf8=0, utf8=no or utf8=false. If `uni_xlate' gets set, UTF8 gets +disabled. +.TP +.BI shortname= mode +Defines the behavior for creation and display of filenames which fit into +8.3 characters. If a long name for a file exists, it will always be the +preferred one for display. There are four \fImode\fRs: +.RS +.TP +.B lower +Force the short name to lower case upon display; store a long name when +the short name is not all upper case. +.TP +.B win95 +Force the short name to upper case upon display; store a long name when +the short name is not all upper case. +.TP +.B winnt +Display the short name as is; store a long name when the short name is +not all lower case or all upper case. +.TP +.B mixed +Display the short name as is; store a long name when the short name is not +all upper case. This mode is the default since Linux 2.6.32. +.RE + +.SS "Mount options for usbfs" +.TP +\fBdevuid=\fP\,\fIuid\fP and \fBdevgid=\fP\,\fIgid\fP and \fBdevmode=\fP\,\fImode\fP +Set the owner and group and mode of the device files in the usbfs filesystem +(default: uid=gid=0, mode=0644). The mode is given in octal. +.TP +\fBbusuid=\fP\,\fIuid\fP and \fBbusgid=\fP\,\fIgid\fP and \fBbusmode=\fP\,\fImode\fP +Set the owner and group and mode of the bus directories in the usbfs +filesystem (default: uid=gid=0, mode=0555). The mode is given in octal. +.TP +\fBlistuid=\fP\,\fIuid\fP and \fBlistgid=\fP\,\fIgid\fP and \fBlistmode=\fP\,\fImode\fP +Set the owner and group and mode of the file +.I devices +(default: uid=gid=0, mode=0444). The mode is given in octal. + +.SH "THE LOOP DEVICE" +One further possible type is a mount via the loop device. For example, +the command +.RS +.sp +.B "mount /tmp/disk.img /mnt \-t vfat \-o loop=/dev/loop3" +.sp +.RE +will set up the loop device +.I /dev/loop3 +to correspond to the file +.IR /tmp/disk.img , +and then mount this device on +.IR /mnt . + +If no explicit loop device is mentioned +(but just an option `\fB\-o loop\fP' is given), then +.B mount +will try to find some unused loop device and use that, for example +.RS +.sp +.B "mount /tmp/disk.img /mnt \-o loop" +.sp +.RE +The mount command +.B automatically +creates a loop device from a regular file if a filesystem type is +not specified or the filesystem is known for libblkid, for example: +.RS +.sp +.B "mount /tmp/disk.img /mnt" +.sp +.B "mount \-t ext4 /tmp/disk.img /mnt" +.sp +.RE +This type of mount knows about three options, namely +.BR loop ", " offset " and " sizelimit , +that are really options to +.BR \%losetup (8). +(These options can be used in addition to those specific +to the filesystem type.) + +Since Linux 2.6.25 auto-destruction of loop devices is supported, +meaning that any loop device allocated by +.B mount +will be freed by +.B umount +independently of +.IR /etc/mtab . + +You can also free a loop device by hand, using +.BR "losetup \-d " or " umount \-d" . + +Since util-linux v2.29 mount command re-uses the loop device rather than +initialize a new device if the same backing file is already used for some loop +device with the same offset and sizelimit. This is necessary to avoid +a filesystem corruption. + +.SH RETURN CODES +.B mount +has the following return codes (the bits can be ORed): +.TP +.B 0 +success +.TP +.B 1 +incorrect invocation or permissions +.TP +.B 2 +system error (out of memory, cannot fork, no more loop devices) +.TP +.B 4 +internal +.B mount +bug +.TP +.B 8 +user interrupt +.TP +.B 16 +problems writing or locking /etc/mtab +.TP +.B 32 +mount failure +.TP +.B 64 +some mount succeeded +.RE + +The command \fBmount \-a\fR returns 0 (all succeeded), 32 (all failed), or 64 (some +failed, some succeeded). + +.SH "EXTERNAL HELPERS" +The syntax of external mount helpers is: +.sp +.in +4 +.BI /sbin/mount. suffix +.I spec dir +.RB [ \-sfnv ] +.RB [ \-N +.IR namespace ] +.RB [ \-o +.IR options ] +.RB [ \-t +.IR type \fB. subtype ] +.in +.sp +where the \fIsuffix\fR is the filesystem type and the \fB\-sfnvoN\fR options have +the same meaning as the normal mount options. The \fB\-t\fR option is used for +filesystems with subtypes support (for example +.BR "/sbin/mount.fuse \-t fuse.sshfs" ). + +The command \fBmount\fR does not pass the mount options +.BR unbindable , +.BR runbindable , +.BR private , +.BR rprivate , +.BR slave , +.BR rslave , +.BR shared , +.BR rshared , +.BR auto , +.BR noauto , +.BR comment , +.BR x-* , +.BR loop , +.B offset +and +.B sizelimit +to the mount.<suffix> helpers. All other options are used in a +comma-separated list as argument to the \fB\-o\fR option. + +.SH FILES +See also "\fBThe files /etc/fstab, /etc/mtab and /proc/mounts\fR" section above. +.TP 18n +.I /etc/fstab +filesystem table +.TP +.I /run/mount +libmount private runtime directory +.TP +.I /etc/mtab +table of mounted filesystems or symlink to /proc/mounts +.TP +.I /etc/mtab\s+3~\s0 +lock file (unused on systems with mtab symlink) +.TP +.I /etc/mtab.tmp +temporary file (unused on systems with mtab symlink) +.TP +.I /etc/filesystems +a list of filesystem types to try +.SH ENVIRONMENT +.IP LIBMOUNT_FSTAB=<path> +overrides the default location of the fstab file (ignored for suid) +.IP LIBMOUNT_MTAB=<path> +overrides the default location of the mtab file (ignored for suid) +.IP LIBMOUNT_DEBUG=all +enables libmount debug output +.IP LIBBLKID_DEBUG=all +enables libblkid debug output +.IP LOOPDEV_DEBUG=all +enables loop device setup debug output +.SH "SEE ALSO" +.na +.BR mount (2), +.BR umount (2), +.BR umount (8), +.BR fstab (5), +.BR nfs (5), +.BR xfs (5), +.BR e2label (8), +.BR findmnt (8), +.BR losetup (8), +.BR mke2fs (8), +.BR mountd (8), +.BR nfsd (8), +.BR swapon (8), +.BR tune2fs (8), +.BR xfs_admin (8) +.ad +.SH BUGS +It is possible for a corrupted filesystem to cause a crash. +.PP +Some Linux filesystems don't support +.BR "\-o sync " nor " \-o dirsync" +(the ext2, ext3, ext4, fat and vfat filesystems +.I do +support synchronous updates (a la BSD) when mounted with the +.B sync +option). +.PP +The +.B "\-o remount" +may not be able to change mount parameters (all +.IR ext2fs -specific +parameters, except +.BR sb , +are changeable with a remount, for example, but you can't change +.B gid +or +.B umask +for the +.IR fatfs ). +.PP +It is possible that the files +.I /etc/mtab +and +.I /proc/mounts +don't match on systems with a regular mtab file. The first file is based only on +the mount command options, but the content of the second file also depends on +the kernel and others settings (e.g.\& on a remote NFS server -- in certain cases +the mount command may report unreliable information about an NFS mount point +and the /proc/mounts file usually contains more reliable information.) This is +another reason to replace the mtab file with a symlink to the +.I /proc/mounts +file. +.PP +Checking files on NFS filesystems referenced by file descriptors (i.e.\& the +.B fcntl +and +.B ioctl +families of functions) may lead to inconsistent results due to the lack of +a consistency check in the kernel even if noac is used. +.PP +The +.B loop +option with the +.B offset +or +.B sizelimit +options used may fail when using older kernels if the +.B mount +command can't confirm that the size of the block device has been configured +as requested. This situation can be worked around by using +the +.B losetup +command manually before calling +.B mount +with the configured loop device. +.SH HISTORY +A +.B mount +command existed in Version 5 AT&T UNIX. +.SH AUTHORS +.nf +Karel Zak <kzak@redhat.com> +.fi +.SH AVAILABILITY +The mount command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/mount.c b/sys-utils/mount.c new file mode 100644 index 0000000..5e139e8 --- /dev/null +++ b/sys-utils/mount.c @@ -0,0 +1,918 @@ +/* + * mount(8) -- mount a filesystem + * + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Written by Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <getopt.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <stdarg.h> +#include <libmount.h> +#include <ctype.h> + +#include "nls.h" +#include "c.h" +#include "env.h" +#include "strutils.h" +#include "closestream.h" +#include "canonicalize.h" + +#define XALLOC_EXIT_CODE MNT_EX_SYSERR +#include "xalloc.h" + +#define OPTUTILS_EXIT_CODE MNT_EX_USAGE +#include "optutils.h" + +/*** TODO: DOCS: + * + * --options-mode={ignore,append,prepend,replace} MNT_OMODE_{IGNORE, ...} + * --options-source={fstab,mtab,disable} MNT_OMODE_{FSTAB,MTAB,NOTAB} + * --options-source-force MNT_OMODE_FORCE + */ + +static int mk_exit_code(struct libmnt_context *cxt, int rc); + +static void __attribute__((__noreturn__)) exit_non_root(const char *option) +{ + const uid_t ruid = getuid(); + const uid_t euid = geteuid(); + + if (ruid == 0 && euid != 0) { + /* user is root, but setuid to non-root */ + if (option) + errx(MNT_EX_USAGE, _("only root can use \"--%s\" option " + "(effective UID is %u)"), + option, euid); + errx(MNT_EX_USAGE, _("only root can do that " + "(effective UID is %u)"), euid); + } + if (option) + errx(MNT_EX_USAGE, _("only root can use \"--%s\" option"), option); + errx(MNT_EX_USAGE, _("only root can do that")); +} + +static void __attribute__((__noreturn__)) print_version(void) +{ + const char *ver = NULL; + const char **features = NULL, **p; + + mnt_get_library_version(&ver); + mnt_get_library_features(&features); + + printf(_("%s from %s (libmount %s"), + program_invocation_short_name, + PACKAGE_STRING, + ver); + p = features; + while (p && *p) { + fputs(p == features ? ": " : ", ", stdout); + fputs(*p++, stdout); + } + fputs(")\n", stdout); + exit(MNT_EX_SUCCESS); +} + +static int table_parser_errcb(struct libmnt_table *tb __attribute__((__unused__)), + const char *filename, int line) +{ + if (filename) + warnx(_("%s: parse error at line %d -- ignored"), filename, line); + return 1; +} + +/* + * Replace control chars with '?' to be compatible with coreutils. For more + * robust solution use findmnt(1) where we use \x?? hex encoding. + */ +static void safe_fputs(const char *data) +{ + const char *p; + + for (p = data; p && *p; p++) { + if (iscntrl((unsigned char) *p)) + fputc('?', stdout); + else + fputc(*p, stdout); + } +} + +static void print_all(struct libmnt_context *cxt, char *pattern, int show_label) +{ + struct libmnt_table *tb; + struct libmnt_iter *itr = NULL; + struct libmnt_fs *fs; + struct libmnt_cache *cache = NULL; + + if (mnt_context_get_mtab(cxt, &tb)) + err(MNT_EX_SYSERR, _("failed to read mtab")); + + itr = mnt_new_iter(MNT_ITER_FORWARD); + if (!itr) + err(MNT_EX_SYSERR, _("failed to initialize libmount iterator")); + if (show_label) + cache = mnt_new_cache(); + + while (mnt_table_next_fs(tb, itr, &fs) == 0) { + const char *type = mnt_fs_get_fstype(fs); + const char *src = mnt_fs_get_source(fs); + const char *optstr = mnt_fs_get_options(fs); + char *xsrc = NULL; + + if (type && pattern && !mnt_match_fstype(type, pattern)) + continue; + + if (!mnt_fs_is_pseudofs(fs)) + xsrc = mnt_pretty_path(src, cache); + printf ("%s on ", xsrc ? xsrc : src); + safe_fputs(mnt_fs_get_target(fs)); + + if (type) + printf (" type %s", type); + if (optstr) + printf (" (%s)", optstr); + if (show_label && src) { + char *lb = mnt_cache_find_tag_value(cache, src, "LABEL"); + if (lb) + printf (" [%s]", lb); + } + fputc('\n', stdout); + free(xsrc); + } + + mnt_unref_cache(cache); + mnt_free_iter(itr); +} + +/* + * mount -a [-F] + */ +static int mount_all(struct libmnt_context *cxt) +{ + struct libmnt_iter *itr; + struct libmnt_fs *fs; + int mntrc, ignored, rc = MNT_EX_SUCCESS; + + int nsucc = 0, nerrs = 0; + + itr = mnt_new_iter(MNT_ITER_FORWARD); + if (!itr) { + warn(_("failed to initialize libmount iterator")); + return MNT_EX_SYSERR; + } + + while (mnt_context_next_mount(cxt, itr, &fs, &mntrc, &ignored) == 0) { + + const char *tgt = mnt_fs_get_target(fs); + + if (ignored) { + if (mnt_context_is_verbose(cxt)) + printf(ignored == 1 ? _("%-25s: ignored\n") : + _("%-25s: already mounted\n"), + tgt); + } else if (mnt_context_is_fork(cxt)) { + if (mnt_context_is_verbose(cxt)) + printf("%-25s: mount successfully forked\n", tgt); + } else { + if (mk_exit_code(cxt, mntrc) == MNT_EX_SUCCESS) { + nsucc++; + + /* Note that MNT_EX_SUCCESS return code does + * not mean that FS has been really mounted + * (e.g. nofail option) */ + if (mnt_context_get_status(cxt) + && mnt_context_is_verbose(cxt)) + printf("%-25s: successfully mounted\n", tgt); + } else + nerrs++; + } + } + + if (mnt_context_is_parent(cxt)) { + /* wait for mount --fork children */ + int nchildren = 0; + + nerrs = 0, nsucc = 0; + + rc = mnt_context_wait_for_children(cxt, &nchildren, &nerrs); + if (!rc && nchildren) + nsucc = nchildren - nerrs; + } + + if (nerrs == 0) + rc = MNT_EX_SUCCESS; /* all success */ + else if (nsucc == 0) + rc = MNT_EX_FAIL; /* all failed */ + else + rc = MNT_EX_SOMEOK; /* some success, some failed */ + + mnt_free_iter(itr); + return rc; +} + +static void success_message(struct libmnt_context *cxt) +{ + unsigned long mflags = 0; + const char *tgt, *src, *pr = program_invocation_short_name; + + if (mnt_context_helper_executed(cxt) + || mnt_context_get_status(cxt) != 1) + return; + + mnt_context_get_mflags(cxt, &mflags); + tgt = mnt_context_get_target(cxt); + src = mnt_context_get_source(cxt); + + if (mflags & MS_MOVE) + printf(_("%s: %s moved to %s.\n"), pr, src, tgt); + else if (mflags & MS_BIND) + printf(_("%s: %s bound on %s.\n"), pr, src, tgt); + else if (mflags & MS_PROPAGATION) { + if (src && strcmp(src, "none") != 0 && tgt) + printf(_("%s: %s mounted on %s.\n"), pr, src, tgt); + + printf(_("%s: %s propagation flags changed.\n"), pr, tgt); + } else + printf(_("%s: %s mounted on %s.\n"), pr, src, tgt); +} + +#if defined(HAVE_LIBSELINUX) && defined(HAVE_SECURITY_GET_INITIAL_CONTEXT) +#include <selinux/selinux.h> +#include <selinux/context.h> + +static void selinux_warning(struct libmnt_context *cxt, const char *tgt) +{ + + if (tgt && mnt_context_is_verbose(cxt) && is_selinux_enabled() > 0) { + security_context_t raw = NULL, def = NULL; + + if (getfilecon(tgt, &raw) > 0 + && security_get_initial_context("file", &def) == 0) { + + if (!selinux_file_context_cmp(raw, def)) + printf(_( + "mount: %s does not contain SELinux labels.\n" + " You just mounted an file system that supports labels which does not\n" + " contain labels, onto an SELinux box. It is likely that confined\n" + " applications will generate AVC messages and not be allowed access to\n" + " this file system. For more details see restorecon(8) and mount(8).\n"), + tgt); + } + freecon(raw); + freecon(def); + } +} +#else +# define selinux_warning(_x, _y) +#endif + +/* + * Returns exit status (MNT_EX_*) and/or prints error message. + */ +static int mk_exit_code(struct libmnt_context *cxt, int rc) +{ + const char *tgt; + char buf[BUFSIZ] = { 0 }; + + rc = mnt_context_get_excode(cxt, rc, buf, sizeof(buf)); + tgt = mnt_context_get_target(cxt); + + if (*buf) { + const char *spec = tgt; + if (!spec) + spec = mnt_context_get_source(cxt); + if (!spec) + spec = "???"; + warnx("%s: %s.", spec, buf); + } + + if (rc == MNT_EX_SUCCESS && mnt_context_get_status(cxt) == 1) { + selinux_warning(cxt, tgt); + } + return rc; +} + +static struct libmnt_table *append_fstab(struct libmnt_context *cxt, + struct libmnt_table *fstab, + const char *path) +{ + + if (!fstab) { + fstab = mnt_new_table(); + if (!fstab) + err(MNT_EX_SYSERR, _("failed to initialize libmount table")); + + mnt_table_set_parser_errcb(fstab, table_parser_errcb); + mnt_context_set_fstab(cxt, fstab); + + mnt_unref_table(fstab); /* reference is handled by @cxt now */ + } + + if (mnt_table_parse_fstab(fstab, path)) + errx(MNT_EX_USAGE,_("%s: failed to parse"), path); + + return fstab; +} + +/* + * Check source and target paths -- non-root user should not be able to + * resolve paths which are unreadable for him. + */ +static void sanitize_paths(struct libmnt_context *cxt) +{ + const char *p; + struct libmnt_fs *fs = mnt_context_get_fs(cxt); + + if (!fs) + return; + + p = mnt_fs_get_target(fs); + if (p) { + char *np = canonicalize_path_restricted(p); + if (!np) + err(MNT_EX_USAGE, "%s", p); + mnt_fs_set_target(fs, np); + free(np); + } + + p = mnt_fs_get_srcpath(fs); + if (p) { + char *np = canonicalize_path_restricted(p); + if (!np) + err(MNT_EX_USAGE, "%s", p); + mnt_fs_set_source(fs, np); + free(np); + } +} + +static void append_option(struct libmnt_context *cxt, const char *opt) +{ + if (opt && (*opt == '=' || *opt == '\'' || *opt == '\"' || isblank(*opt))) + errx(MNT_EX_USAGE, _("unsupported option format: %s"), opt); + if (mnt_context_append_options(cxt, opt)) + err(MNT_EX_SYSERR, _("failed to append option '%s'"), opt); +} + +static int has_remount_flag(struct libmnt_context *cxt) +{ + unsigned long mflags = 0; + + if (mnt_context_get_mflags(cxt, &mflags)) + return 0; + + return mflags & MS_REMOUNT; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _( + " %1$s [-lhV]\n" + " %1$s -a [options]\n" + " %1$s [options] [--source] <source> | [--target] <directory>\n" + " %1$s [options] <source> <directory>\n" + " %1$s <operation> <mountpoint> [<target>]\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Mount a filesystem.\n"), out); + + fputs(USAGE_OPTIONS, out); + fprintf(out, _( + " -a, --all mount all filesystems mentioned in fstab\n" + " -c, --no-canonicalize don't canonicalize paths\n" + " -f, --fake dry run; skip the mount(2) syscall\n" + " -F, --fork fork off for each device (use with -a)\n" + " -T, --fstab <path> alternative file to /etc/fstab\n")); + fprintf(out, _( + " -i, --internal-only don't call the mount.<type> helpers\n")); + fprintf(out, _( + " -l, --show-labels show also filesystem labels\n")); + fprintf(out, _( + " -n, --no-mtab don't write to /etc/mtab\n")); + fprintf(out, _( + " --options-mode <mode>\n" + " what to do with options loaded from fstab\n" + " --options-source <source>\n" + " mount options source\n" + " --options-source-force\n" + " force use of options from fstab/mtab\n")); + fprintf(out, _( + " -o, --options <list> comma-separated list of mount options\n" + " -O, --test-opts <list> limit the set of filesystems (use with -a)\n" + " -r, --read-only mount the filesystem read-only (same as -o ro)\n" + " -t, --types <list> limit the set of filesystem types\n")); + fprintf(out, _( + " --source <src> explicitly specifies source (path, label, uuid)\n" + " --target <target> explicitly specifies mountpoint\n")); + fprintf(out, _( + " -v, --verbose say what is being done\n")); + fprintf(out, _( + " -w, --rw, --read-write mount the filesystem read-write (default)\n")); + fprintf(out, _( + " -N, --namespace <ns> perform mount in another namespace\n")); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(25)); + + fprintf(out, _( + "\nSource:\n" + " -L, --label <label> synonym for LABEL=<label>\n" + " -U, --uuid <uuid> synonym for UUID=<uuid>\n" + " LABEL=<label> specifies device by filesystem label\n" + " UUID=<uuid> specifies device by filesystem UUID\n" + " PARTLABEL=<label> specifies device by partition label\n" + " PARTUUID=<uuid> specifies device by partition UUID\n")); + + fprintf(out, _( + " <device> specifies device by path\n" + " <directory> mountpoint for bind mounts (see --bind/rbind)\n" + " <file> regular file for loopdev setup\n")); + + fprintf(out, _( + "\nOperations:\n" + " -B, --bind mount a subtree somewhere else (same as -o bind)\n" + " -M, --move move a subtree to some other place\n" + " -R, --rbind mount a subtree and all submounts somewhere else\n")); + fprintf(out, _( + " --make-shared mark a subtree as shared\n" + " --make-slave mark a subtree as slave\n" + " --make-private mark a subtree as private\n" + " --make-unbindable mark a subtree as unbindable\n")); + fprintf(out, _( + " --make-rshared recursively mark a whole subtree as shared\n" + " --make-rslave recursively mark a whole subtree as slave\n" + " --make-rprivate recursively mark a whole subtree as private\n" + " --make-runbindable recursively mark a whole subtree as unbindable\n")); + + printf(USAGE_MAN_TAIL("mount(8)")); + + exit(MNT_EX_SUCCESS); +} + +struct flag_str { + int value; + char *str; +}; + +static int omode2mask(const char *str) +{ + size_t i; + + static const struct flag_str flags[] = { + { MNT_OMODE_IGNORE, "ignore" }, + { MNT_OMODE_APPEND, "append" }, + { MNT_OMODE_PREPEND, "prepend" }, + { MNT_OMODE_REPLACE, "replace" }, + }; + + for (i = 0; i < ARRAY_SIZE(flags); i++) { + if (!strcmp(str, flags[i].str)) + return flags[i].value; + } + return -EINVAL; +} + +static long osrc2mask(const char *str, size_t len) +{ + size_t i; + + static const struct flag_str flags[] = { + { MNT_OMODE_FSTAB, "fstab" }, + { MNT_OMODE_MTAB, "mtab" }, + { MNT_OMODE_NOTAB, "disable" }, + }; + + for (i = 0; i < ARRAY_SIZE(flags); i++) { + if (!strncmp(str, flags[i].str, len) && !flags[i].str[len]) + return flags[i].value; + } + return -EINVAL; +} + +static pid_t parse_pid(const char *str) +{ + char *end; + pid_t ret; + + errno = 0; + ret = strtoul(str, &end, 10); + + if (ret < 0 || errno || end == str || (end && *end)) + return 0; + return ret; +} + +int main(int argc, char **argv) +{ + int c, rc = MNT_EX_SUCCESS, all = 0, show_labels = 0; + struct libmnt_context *cxt; + struct libmnt_table *fstab = NULL; + char *srcbuf = NULL; + char *types = NULL; + int oper = 0, is_move = 0; + int propa = 0; + int optmode = 0, optmode_mode = 0, optmode_src = 0; + + enum { + MOUNT_OPT_SHARED = CHAR_MAX + 1, + MOUNT_OPT_SLAVE, + MOUNT_OPT_PRIVATE, + MOUNT_OPT_UNBINDABLE, + MOUNT_OPT_RSHARED, + MOUNT_OPT_RSLAVE, + MOUNT_OPT_RPRIVATE, + MOUNT_OPT_RUNBINDABLE, + MOUNT_OPT_TARGET, + MOUNT_OPT_SOURCE, + MOUNT_OPT_OPTMODE, + MOUNT_OPT_OPTSRC, + MOUNT_OPT_OPTSRC_FORCE + }; + + static const struct option longopts[] = { + { "all", no_argument, NULL, 'a' }, + { "fake", no_argument, NULL, 'f' }, + { "fstab", required_argument, NULL, 'T' }, + { "fork", no_argument, NULL, 'F' }, + { "help", no_argument, NULL, 'h' }, + { "no-mtab", no_argument, NULL, 'n' }, + { "read-only", no_argument, NULL, 'r' }, + { "ro", no_argument, NULL, 'r' }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + { "read-write", no_argument, NULL, 'w' }, + { "rw", no_argument, NULL, 'w' }, + { "options", required_argument, NULL, 'o' }, + { "test-opts", required_argument, NULL, 'O' }, + { "types", required_argument, NULL, 't' }, + { "uuid", required_argument, NULL, 'U' }, + { "label", required_argument, NULL, 'L' }, + { "bind", no_argument, NULL, 'B' }, + { "move", no_argument, NULL, 'M' }, + { "rbind", no_argument, NULL, 'R' }, + { "make-shared", no_argument, NULL, MOUNT_OPT_SHARED }, + { "make-slave", no_argument, NULL, MOUNT_OPT_SLAVE }, + { "make-private", no_argument, NULL, MOUNT_OPT_PRIVATE }, + { "make-unbindable", no_argument, NULL, MOUNT_OPT_UNBINDABLE }, + { "make-rshared", no_argument, NULL, MOUNT_OPT_RSHARED }, + { "make-rslave", no_argument, NULL, MOUNT_OPT_RSLAVE }, + { "make-rprivate", no_argument, NULL, MOUNT_OPT_RPRIVATE }, + { "make-runbindable", no_argument, NULL, MOUNT_OPT_RUNBINDABLE }, + { "no-canonicalize", no_argument, NULL, 'c' }, + { "internal-only", no_argument, NULL, 'i' }, + { "show-labels", no_argument, NULL, 'l' }, + { "target", required_argument, NULL, MOUNT_OPT_TARGET }, + { "source", required_argument, NULL, MOUNT_OPT_SOURCE }, + { "options-mode", required_argument, NULL, MOUNT_OPT_OPTMODE }, + { "options-source", required_argument, NULL, MOUNT_OPT_OPTSRC }, + { "options-source-force", no_argument, NULL, MOUNT_OPT_OPTSRC_FORCE}, + { "namespace", required_argument, NULL, 'N' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'B','M','R' }, /* bind,move,rbind */ + { 'L','U', MOUNT_OPT_SOURCE }, /* label,uuid,source */ + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + sanitize_env(); + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + strutils_set_exitcode(MNT_EX_USAGE); + + mnt_init_debug(0); + cxt = mnt_new_context(); + if (!cxt) + err(MNT_EX_SYSERR, _("libmount context allocation failed")); + + mnt_context_set_tables_errcb(cxt, table_parser_errcb); + + while ((c = getopt_long(argc, argv, "aBcfFhilL:Mno:O:rRsU:vVwt:T:N:", + longopts, NULL)) != -1) { + + /* only few options are allowed for non-root users */ + if (mnt_context_is_restricted(cxt) && + !strchr("hlLUVvrist", c) && + c != MOUNT_OPT_TARGET && + c != MOUNT_OPT_SOURCE) + exit_non_root(option_to_longopt(c, longopts)); + + err_exclusive_options(c, longopts, excl, excl_st); + + switch(c) { + case 'a': + all = 1; + break; + case 'c': + mnt_context_disable_canonicalize(cxt, TRUE); + break; + case 'f': + mnt_context_enable_fake(cxt, TRUE); + break; + case 'F': + mnt_context_enable_fork(cxt, TRUE); + break; + case 'h': + usage(); + break; + case 'i': + mnt_context_disable_helpers(cxt, TRUE); + break; + case 'n': + mnt_context_disable_mtab(cxt, TRUE); + break; + case 'r': + append_option(cxt, "ro"); + mnt_context_enable_rwonly_mount(cxt, FALSE); + break; + case 'v': + mnt_context_enable_verbose(cxt, TRUE); + break; + case 'V': + print_version(); + break; + case 'w': + append_option(cxt, "rw"); + mnt_context_enable_rwonly_mount(cxt, TRUE); + break; + case 'o': + append_option(cxt, optarg); + break; + case 'O': + if (mnt_context_set_options_pattern(cxt, optarg)) + err(MNT_EX_SYSERR, _("failed to set options pattern")); + break; + case 'L': + xasprintf(&srcbuf, "LABEL=\"%s\"", optarg); + mnt_context_disable_swapmatch(cxt, 1); + mnt_context_set_source(cxt, srcbuf); + free(srcbuf); + break; + case 'U': + xasprintf(&srcbuf, "UUID=\"%s\"", optarg); + mnt_context_disable_swapmatch(cxt, 1); + mnt_context_set_source(cxt, srcbuf); + free(srcbuf); + break; + case 'l': + show_labels = 1; + break; + case 't': + types = optarg; + break; + case 'T': + fstab = append_fstab(cxt, fstab, optarg); + break; + case 's': + mnt_context_enable_sloppy(cxt, TRUE); + break; + case 'B': + oper = 1; + append_option(cxt, "bind"); + break; + case 'M': + oper = 1; + is_move = 1; + break; + case 'R': + oper = 1; + append_option(cxt, "rbind"); + break; + case 'N': + { + char path[PATH_MAX]; + pid_t pid = parse_pid(optarg); + + if (pid) + snprintf(path, sizeof(path), "/proc/%i/ns/mnt", pid); + + if (mnt_context_set_target_ns(cxt, pid ? path : optarg)) + err(MNT_EX_SYSERR, _("failed to set target namespace to %s"), pid ? path : optarg); + break; + } + case MOUNT_OPT_SHARED: + append_option(cxt, "shared"); + propa = 1; + break; + case MOUNT_OPT_SLAVE: + append_option(cxt, "slave"); + propa = 1; + break; + case MOUNT_OPT_PRIVATE: + append_option(cxt, "private"); + propa = 1; + break; + case MOUNT_OPT_UNBINDABLE: + append_option(cxt, "unbindable"); + propa = 1; + break; + case MOUNT_OPT_RSHARED: + append_option(cxt, "rshared"); + propa = 1; + break; + case MOUNT_OPT_RSLAVE: + append_option(cxt, "rslave"); + propa = 1; + break; + case MOUNT_OPT_RPRIVATE: + append_option(cxt, "rprivate"); + propa = 1; + break; + case MOUNT_OPT_RUNBINDABLE: + append_option(cxt, "runbindable"); + propa = 1; + break; + case MOUNT_OPT_TARGET: + mnt_context_disable_swapmatch(cxt, 1); + mnt_context_set_target(cxt, optarg); + break; + case MOUNT_OPT_SOURCE: + mnt_context_disable_swapmatch(cxt, 1); + mnt_context_set_source(cxt, optarg); + break; + case MOUNT_OPT_OPTMODE: + optmode_mode = omode2mask(optarg); + if (optmode_mode == -EINVAL) { + warnx(_("bad usage")); + errtryhelp(MNT_EX_USAGE); + } + break; + case MOUNT_OPT_OPTSRC: + { + unsigned long tmp = 0; + if (string_to_bitmask(optarg, &tmp, osrc2mask)) { + warnx(_("bad usage")); + errtryhelp(MNT_EX_USAGE); + } + optmode_src = tmp; + break; + } + case MOUNT_OPT_OPTSRC_FORCE: + optmode |= MNT_OMODE_FORCE; + break; + default: + errtryhelp(MNT_EX_USAGE); + } + } + + argc -= optind; + argv += optind; + + optmode |= optmode_mode | optmode_src; + if (optmode) { + if (!optmode_mode) + optmode |= MNT_OMODE_PREPEND; + if (!optmode_src) + optmode |= MNT_OMODE_FSTAB | MNT_OMODE_MTAB; + mnt_context_set_optsmode(cxt, optmode); + } + + if (fstab && !mnt_context_is_nocanonicalize(cxt)) { + /* + * We have external (context independent) fstab instance, let's + * make a connection between the fstab and the canonicalization + * cache. + */ + mnt_table_set_cache(fstab, mnt_context_get_cache(cxt)); + } + + if (!mnt_context_get_source(cxt) && + !mnt_context_get_target(cxt) && + !argc && + !all) { + if (oper || mnt_context_get_options(cxt)) { + warnx(_("bad usage")); + errtryhelp(MNT_EX_USAGE); + } + print_all(cxt, types, show_labels); + goto done; + } + + /* Non-root users are allowed to use -t to print_all(), + but not to mount */ + if (mnt_context_is_restricted(cxt) && types) + exit_non_root("types"); + + if (oper && (types || all || mnt_context_get_source(cxt))) { + warnx(_("bad usage")); + errtryhelp(MNT_EX_USAGE); + } + + if (types && (all || strchr(types, ',') || + strncmp(types, "no", 2) == 0)) + mnt_context_set_fstype_pattern(cxt, types); + else if (types) + mnt_context_set_fstype(cxt, types); + + if (all) { + /* + * A) Mount all + */ + rc = mount_all(cxt); + goto done; + + } else if (argc == 0 && (mnt_context_get_source(cxt) || + mnt_context_get_target(cxt))) { + /* + * B) mount -L|-U|--source|--target + * + * non-root may specify source *or* target, but not both + */ + if (mnt_context_is_restricted(cxt) && + mnt_context_get_source(cxt) && + mnt_context_get_target(cxt)) + exit_non_root(NULL); + + } else if (argc == 1 && (!mnt_context_get_source(cxt) || + !mnt_context_get_target(cxt))) { + /* + * C) mount [-L|-U|--source] <target> + * mount [--target <dir>] <source> + * mount <source|target> + * + * non-root may specify source *or* target, but not both + * + * It does not matter for libmount if we set source or target + * here (the library is able to swap it), but it matters for + * sanitize_paths(). + */ + int istag = mnt_tag_is_valid(argv[0]); + + if (istag && mnt_context_get_source(cxt)) + /* -L, -U or --source together with LABEL= or UUID= */ + errx(MNT_EX_USAGE, _("source specified more than once")); + else if (istag || mnt_context_get_target(cxt)) + mnt_context_set_source(cxt, argv[0]); + else + mnt_context_set_target(cxt, argv[0]); + + if (mnt_context_is_restricted(cxt) && + mnt_context_get_source(cxt) && + mnt_context_get_target(cxt)) + exit_non_root(NULL); + + } else if (argc == 2 && !mnt_context_get_source(cxt) + && !mnt_context_get_target(cxt)) { + /* + * D) mount <source> <target> + */ + if (mnt_context_is_restricted(cxt)) + exit_non_root(NULL); + + mnt_context_set_source(cxt, argv[0]); + mnt_context_set_target(cxt, argv[1]); + + } else { + warnx(_("bad usage")); + errtryhelp(MNT_EX_USAGE); + } + + if (mnt_context_is_restricted(cxt)) + sanitize_paths(cxt); + + if (is_move) + /* "move" as option string is not supported by libmount */ + mnt_context_set_mflags(cxt, MS_MOVE); + + if ((oper && !has_remount_flag(cxt)) || propa) + /* For --make-* or --bind is fstab/mtab unnecessary */ + mnt_context_set_optsmode(cxt, MNT_OMODE_NOTAB); + + rc = mnt_context_mount(cxt); + rc = mk_exit_code(cxt, rc); + + if (rc == MNT_EX_SUCCESS && mnt_context_is_verbose(cxt)) + success_message(cxt); +done: + mnt_free_context(cxt); + return rc; +} + diff --git a/sys-utils/mountpoint.1 b/sys-utils/mountpoint.1 new file mode 100644 index 0000000..afc469e --- /dev/null +++ b/sys-utils/mountpoint.1 @@ -0,0 +1,58 @@ +.TH MOUNTPOINT 1 "July 2014" "util-linux" "User Commands" +.SH NAME +mountpoint \- see if a directory or file is a mountpoint +.SH SYNOPSIS +.B mountpoint +.RB [ \-d | \-q ] +.I directory +| +.I file +.sp +.B mountpoint +.B \-x +.I device + +.SH DESCRIPTION +.B mountpoint +checks whether the given +.I directory +or +.I file +is mentioned in the /proc/self/mountinfo file. +.SH OPTIONS +.TP +.BR \-d , " \-\-fs\-devno" +Show the major/minor numbers of the device that is mounted on the given +directory. +.TP +.BR \-q , " \-\-quiet" +Be quiet - don't print anything. +.TP +.BR \-x , " \-\-devno" +Show the major/minor numbers of the given blockdevice on standard output. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH EXIT STATUS +Zero if the directory or file is a mountpoint, non-zero if not. +.SH AUTHOR +.PP +Karel Zak <kzak@redhat.com> +.SH ENVIRONMENT +.IP LIBMOUNT_DEBUG=all +enables libmount debug output. +.SH NOTES +.PP +The util-linux +.B mountpoint +implementation was written from scratch for libmount. The original version +for sysvinit suite was written by Miquel van Smoorenburg. + +.SH SEE ALSO +.BR mount (8) +.SH AVAILABILITY +The mountpoint command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/mountpoint.c b/sys-utils/mountpoint.c new file mode 100644 index 0000000..00a74da --- /dev/null +++ b/sys-utils/mountpoint.c @@ -0,0 +1,203 @@ +/* + * mountpoint(1) - see if a directory is a mountpoint + * + * This is libmount based reimplementation of the mountpoint(1) + * from sysvinit project. + * + * + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Written by Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <getopt.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include <libmount.h> + +#include "nls.h" +#include "xalloc.h" +#include "c.h" +#include "closestream.h" +#include "pathnames.h" + +struct mountpoint_control { + char *path; + dev_t dev; + struct stat st; + unsigned int + dev_devno:1, + fs_devno:1, + quiet:1; +}; + +static int dir_to_device(struct mountpoint_control *ctl) +{ + struct libmnt_table *tb = mnt_new_table_from_file(_PATH_PROC_MOUNTINFO); + struct libmnt_fs *fs; + struct libmnt_cache *cache; + int rc = -1; + + if (!tb) { + /* + * Fallback. Traditional way to detect mountpoints. This way + * is independent on /proc, but not able to detect bind mounts. + */ + struct stat pst; + char buf[PATH_MAX], *cn; + int len; + + cn = mnt_resolve_path(ctl->path, NULL); /* canonicalize */ + + len = snprintf(buf, sizeof(buf), "%s/..", cn ? cn : ctl->path); + free(cn); + + if (len < 0 || (size_t) len >= sizeof(buf)) + return -1; + if (stat(buf, &pst) !=0) + return -1; + + if (ctl->st.st_dev != pst.st_dev || ctl->st.st_ino == pst.st_ino) { + ctl->dev = ctl->st.st_dev; + return 0; + } + + return -1; + } + + /* to canonicalize all necessary paths */ + cache = mnt_new_cache(); + mnt_table_set_cache(tb, cache); + mnt_unref_cache(cache); + + fs = mnt_table_find_target(tb, ctl->path, MNT_ITER_BACKWARD); + if (fs && mnt_fs_get_target(fs)) { + ctl->dev = mnt_fs_get_devno(fs); + rc = 0; + } + + mnt_unref_table(tb); + return rc; +} + +static int print_devno(const struct mountpoint_control *ctl) +{ + if (!S_ISBLK(ctl->st.st_mode)) { + if (!ctl->quiet) + warnx(_("%s: not a block device"), ctl->path); + return -1; + } + printf("%u:%u\n", major(ctl->st.st_rdev), minor(ctl->st.st_rdev)); + return 0; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %1$s [-qd] /path/to/directory\n" + " %1$s -x /dev/device\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Check whether a directory or file is a mountpoint.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -q, --quiet quiet mode - don't print anything\n" + " -d, --fs-devno print maj:min device number of the filesystem\n" + " -x, --devno print maj:min device number of the block device\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(20)); + printf(USAGE_MAN_TAIL("mountpoint(1)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + int c; + struct mountpoint_control ctl = { NULL }; + + static const struct option longopts[] = { + { "quiet", no_argument, NULL, 'q' }, + { "fs-devno", no_argument, NULL, 'd' }, + { "devno", no_argument, NULL, 'x' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + mnt_init_debug(0); + + while ((c = getopt_long(argc, argv, "qdxhV", longopts, NULL)) != -1) { + + switch(c) { + case 'q': + ctl.quiet = 1; + break; + case 'd': + ctl.fs_devno = 1; + break; + case 'x': + ctl.dev_devno = 1; + break; + case 'h': + usage(); + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (optind + 1 != argc) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + ctl.path = argv[optind]; + + if (stat(ctl.path, &ctl.st)) { + if (!ctl.quiet) + err(EXIT_FAILURE, "%s", ctl.path); + return EXIT_FAILURE; + } + if (ctl.dev_devno) + return print_devno(&ctl) ? EXIT_FAILURE : EXIT_SUCCESS; + if (dir_to_device(&ctl)) { + if (!ctl.quiet) + printf(_("%s is not a mountpoint\n"), ctl.path); + return EXIT_FAILURE; + } + if (ctl.fs_devno) + printf("%u:%u\n", major(ctl.dev), minor(ctl.dev)); + else if (!ctl.quiet) + printf(_("%s is a mountpoint\n"), ctl.path); + return EXIT_SUCCESS; +} diff --git a/sys-utils/nsenter.1 b/sys-utils/nsenter.1 new file mode 100644 index 0000000..aacae53 --- /dev/null +++ b/sys-utils/nsenter.1 @@ -0,0 +1,269 @@ +.TH NSENTER 1 "June 2013" "util-linux" "User Commands" +.SH NAME +nsenter \- run program with namespaces of other processes +.SH SYNOPSIS +.B nsenter +[options] +.RI [ program +.RI [ arguments ]] +.SH DESCRIPTION +Enters the namespaces of one or more other processes and then executes the specified +\fIprogram\fP. If \fIprogram\fP is not given, then ``${SHELL}'' is run (default: /bin\:/sh). +.PP +Enterable namespaces are: +.TP +.B mount namespace +Mounting and unmounting filesystems will not affect the rest of the system, +except for filesystems which are explicitly marked as shared (with +\fBmount --make-\:shared\fP; see \fI/proc\:/self\:/mountinfo\fP for the +\fBshared\fP flag). +For further details, see +.BR mount_namespaces (7) +and the discussion of the +.B CLONE_NEWNS +flag in +.BR clone (2). +.TP +.B UTS namespace +Setting hostname or domainname will not affect the rest of the system. +For further details, see +.BR namespaces (7) +and the discussion of the +.B CLONE_NEWUTS +flag in +.BR clone (2). +.TP +.B IPC namespace +The process will have an independent namespace for POSIX message queues +as well as System V message queues, +semaphore sets and shared memory segments. +For further details, see +.BR namespaces (7) +and the discussion of the +.B CLONE_NEWIPC +flag in +.BR clone (2). +.TP +.B network namespace +The process will have independent IPv4 and IPv6 stacks, IP routing tables, +firewall rules, the +.I /proc\:/net +and +.I /sys\:/class\:/net +directory trees, sockets, etc. +For further details, see +.BR namespaces (7) +and the discussion of the +.B CLONE_NEWNET +flag in +.BR clone (2). +.TP +.B PID namespace +Children will have a set of PID to process mappings separate from the +.B nsenter +process +For further details, see +.BR pid_namespaces (7) +and +the discussion of the +.B CLONE_NEWPID +flag in +.B nsenter +will fork by default if changing the PID namespace, so that the new program +and its children share the same PID namespace and are visible to each other. +If \fB\-\-no\-fork\fP is used, the new program will be exec'ed without forking. +.TP +.B user namespace +The process will have a distinct set of UIDs, GIDs and capabilities. +For further details, see +.BR user_namespaces (7) +and the discussion of the +.B CLONE_NEWUSER +flag in +.BR clone (2). +.TP +.B cgroup namespace +The process will have a virtualized view of \fI/proc\:/self\:/cgroup\fP, and new +cgroup mounts will be rooted at the namespace cgroup root. +For further details, see +.BR cgroup_namespaces (7) +and the discussion of the +.B CLONE_NEWCGROUP +flag in +.BR clone (2). +.TP +See \fBclone\fP(2) for the exact semantics of the flags. +.SH OPTIONS +Various of the options below that relate to namespaces take an optional +.I file +argument. +This should be one of the +.I /proc/[pid]/ns/* +files described in +.BR namespaces (7). +.TP +\fB\-a\fR, \fB\-\-all\fR +Enter all namespaces of the target process by the default +.I /proc/[pid]/ns/* +namespace paths. The default paths to the target process namespaces may be +overwritten by namespace specific options (e.g. --all --mount=[path]). + +The user namespace will be ignored if the same as the caller's current user +namespace. It prevents a caller that has dropped capabilities from regaining +those capabilities via a call to setns(). See +.BR setns (2) +for more details. +.TP +\fB\-t\fR, \fB\-\-target\fR \fIpid\fP +Specify a target process to get contexts from. The paths to the contexts +specified by +.I pid +are: +.RS +.PD 0 +.IP "" 20 +.TP +/proc/\fIpid\fR/ns/mnt +the mount namespace +.TP +/proc/\fIpid\fR/ns/uts +the UTS namespace +.TP +/proc/\fIpid\fR/ns/ipc +the IPC namespace +.TP +/proc/\fIpid\fR/ns/net +the network namespace +.TP +/proc/\fIpid\fR/ns/pid +the PID namespace +.TP +/proc/\fIpid\fR/ns/user +the user namespace +.TP +/proc/\fIpid\fR/ns/cgroup +the cgroup namespace +.TP +/proc/\fIpid\fR/root +the root directory +.TP +/proc/\fIpid\fR/cwd +the working directory respectively +.PD +.RE +.TP +\fB\-m\fR, \fB\-\-mount\fR[=\fIfile\fR] +Enter the mount namespace. If no file is specified, enter the mount namespace +of the target process. +If +.I file +is specified, enter the mount namespace +specified by +.IR file . +.TP +\fB\-u\fR, \fB\-\-uts\fR[=\fIfile\fR] +Enter the UTS namespace. If no file is specified, enter the UTS namespace of +the target process. +If +.I file +is specified, enter the UTS namespace specified by +.IR file . +.TP +\fB\-i\fR, \fB\-\-ipc\fR[=\fIfile\fR] +Enter the IPC namespace. If no file is specified, enter the IPC namespace of +the target process. +If +.I file +is specified, enter the IPC namespace specified by +.IR file . +.TP +\fB\-n\fR, \fB\-\-net\fR[=\fIfile\fR] +Enter the network namespace. If no file is specified, enter the network +namespace of the target process. +If +.I file +is specified, enter the network namespace specified by +.IR file . +.TP +\fB\-p\fR, \fB\-\-pid\fR[=\fIfile\fR] +Enter the PID namespace. If no file is specified, enter the PID namespace of +the target process. +If +.I file +is specified, enter the PID namespace specified by +.IR file . +.TP +\fB\-U\fR, \fB\-\-user\fR[=\fIfile\fR] +Enter the user namespace. If no file is specified, enter the user namespace of +the target process. +If +.I file +is specified, enter the user namespace specified by +.IR file . +See also the \fB\-\-setuid\fR and \fB\-\-setgid\fR options. +.TP +\fB\-C\fR, \fB\-\-cgroup\fR[=\fIfile\fR] +Enter the cgroup namespace. If no file is specified, enter the cgroup namespace of +the target process. +If +.I file +is specified, enter the cgroup namespace specified by +.IR file . +.TP +\fB\-G\fR, \fB\-\-setgid\fR \fIgid\fR +Set the group ID which will be used in the entered namespace and drop +supplementary groups. +.BR nsenter (1) +always sets GID for user namespaces, the default is 0. +.TP +\fB\-S\fR, \fB\-\-setuid\fR \fIuid\fR +Set the user ID which will be used in the entered namespace. +.BR nsenter (1) +always sets UID for user namespaces, the default is 0. +.TP +\fB\-\-preserve\-credentials\fR +Don't modify UID and GID when enter user namespace. The default is to +drops supplementary groups and sets GID and UID to 0. +.TP +\fB\-r\fR, \fB\-\-root\fR[=\fIdirectory\fR] +Set the root directory. If no directory is specified, set the root directory to +the root directory of the target process. If directory is specified, set the +root directory to the specified directory. +.TP +\fB\-w\fR, \fB\-\-wd\fR[=\fIdirectory\fR] +Set the working directory. If no directory is specified, set the working +directory to the working directory of the target process. If directory is +specified, set the working directory to the specified directory. +.TP +\fB\-F\fR, \fB\-\-no\-fork\fR +Do not fork before exec'ing the specified program. By default, when entering a +PID namespace, \fBnsenter\fP calls \fBfork\fP before calling \fBexec\fP so that +any children will also be in the newly entered PID namespace. +.TP +\fB\-Z\fR, \fB\-\-follow\-context\fR +Set the SELinux security context used for executing a new process according to +already running process specified by \fB\-\-target\fR PID. (The util-linux has +to be compiled with SELinux support otherwise the option is unavailable.) +.TP +\fB\-V\fR, \fB\-\-version\fR +Display version information and exit. +.TP +\fB\-h\fR, \fB\-\-help\fR +Display help text and exit. +.SH SEE ALSO +.BR clone (2), +.BR setns (2), +.BR namespaces (7) +.SH AUTHORS +.UR biederm@xmission.com +Eric Biederman +.UE +.br +.UR kzak@redhat.com +Karel Zak +.UE +.SH AVAILABILITY +The nsenter command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/nsenter.c b/sys-utils/nsenter.c new file mode 100644 index 0000000..fbfcf98 --- /dev/null +++ b/sys-utils/nsenter.c @@ -0,0 +1,484 @@ +/* + * nsenter(1) - command-line interface for setns(2) + * + * Copyright (C) 2012-2013 Eric Biederman <ebiederm@xmission.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <dirent.h> +#include <errno.h> +#include <getopt.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <unistd.h> +#include <assert.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <grp.h> +#include <sys/stat.h> + +#ifdef HAVE_LIBSELINUX +# include <selinux/selinux.h> +#endif + +#include "strutils.h" +#include "nls.h" +#include "c.h" +#include "closestream.h" +#include "namespace.h" +#include "exec_shell.h" + +static struct namespace_file { + int nstype; + const char *name; + int fd; +} namespace_files[] = { + /* Careful the order is significant in this array. + * + * The user namespace comes either first or last: first if + * you're using it to increase your privilege and last if + * you're using it to decrease. We enter the namespaces in + * two passes starting initially from offset 1 and then offset + * 0 if that fails. + */ + { .nstype = CLONE_NEWUSER, .name = "ns/user", .fd = -1 }, + { .nstype = CLONE_NEWCGROUP,.name = "ns/cgroup", .fd = -1 }, + { .nstype = CLONE_NEWIPC, .name = "ns/ipc", .fd = -1 }, + { .nstype = CLONE_NEWUTS, .name = "ns/uts", .fd = -1 }, + { .nstype = CLONE_NEWNET, .name = "ns/net", .fd = -1 }, + { .nstype = CLONE_NEWPID, .name = "ns/pid", .fd = -1 }, + { .nstype = CLONE_NEWNS, .name = "ns/mnt", .fd = -1 }, + { .nstype = 0, .name = NULL, .fd = -1 } +}; + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Run a program with namespaces of other processes.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --all enter all namespaces\n"), out); + fputs(_(" -t, --target <pid> target process to get namespaces from\n"), out); + fputs(_(" -m, --mount[=<file>] enter mount namespace\n"), out); + fputs(_(" -u, --uts[=<file>] enter UTS namespace (hostname etc)\n"), out); + fputs(_(" -i, --ipc[=<file>] enter System V IPC namespace\n"), out); + fputs(_(" -n, --net[=<file>] enter network namespace\n"), out); + fputs(_(" -p, --pid[=<file>] enter pid namespace\n"), out); + fputs(_(" -C, --cgroup[=<file>] enter cgroup namespace\n"), out); + fputs(_(" -U, --user[=<file>] enter user namespace\n"), out); + fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out); + fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out); + fputs(_(" --preserve-credentials do not touch uids or gids\n"), out); + fputs(_(" -r, --root[=<dir>] set the root directory\n"), out); + fputs(_(" -w, --wd[=<dir>] set the working directory\n"), out); + fputs(_(" -F, --no-fork do not fork before exec'ing <program>\n"), out); +#ifdef HAVE_LIBSELINUX + fputs(_(" -Z, --follow-context set SELinux context according to --target PID\n"), out); +#endif + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(24)); + printf(USAGE_MAN_TAIL("nsenter(1)")); + + exit(EXIT_SUCCESS); +} + +static pid_t namespace_target_pid = 0; +static int root_fd = -1; +static int wd_fd = -1; + +static void open_target_fd(int *fd, const char *type, const char *path) +{ + char pathbuf[PATH_MAX]; + + if (!path && namespace_target_pid) { + snprintf(pathbuf, sizeof(pathbuf), "/proc/%u/%s", + namespace_target_pid, type); + path = pathbuf; + } + if (!path) + errx(EXIT_FAILURE, + _("neither filename nor target pid supplied for %s"), + type); + + if (*fd >= 0) + close(*fd); + + *fd = open(path, O_RDONLY); + if (*fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), path); +} + +static void open_namespace_fd(int nstype, const char *path) +{ + struct namespace_file *nsfile; + + for (nsfile = namespace_files; nsfile->nstype; nsfile++) { + if (nstype != nsfile->nstype) + continue; + + open_target_fd(&nsfile->fd, nsfile->name, path); + return; + } + /* This should never happen */ + assert(nsfile->nstype); +} + +static int get_ns_ino(const char *path, ino_t *ino) +{ + struct stat st; + + if (stat(path, &st) != 0) + return -errno; + *ino = st.st_ino; + return 0; +} + +static int is_same_namespace(pid_t a, pid_t b, const char *type) +{ + char path[PATH_MAX]; + ino_t a_ino = 0, b_ino = 0; + + snprintf(path, sizeof(path), "/proc/%u/%s", a, type); + if (get_ns_ino(path, &a_ino) != 0) + err(EXIT_FAILURE, _("stat of %s failed"), path); + + snprintf(path, sizeof(path), "/proc/%u/%s", b, type); + if (get_ns_ino(path, &b_ino) != 0) + err(EXIT_FAILURE, _("stat of %s failed"), path); + + return a_ino == b_ino; +} + +static void continue_as_child(void) +{ + pid_t child = fork(); + int status; + pid_t ret; + + if (child < 0) + err(EXIT_FAILURE, _("fork failed")); + + /* Only the child returns */ + if (child == 0) + return; + + for (;;) { + ret = waitpid(child, &status, WUNTRACED); + if ((ret == child) && (WIFSTOPPED(status))) { + /* The child suspended so suspend us as well */ + kill(getpid(), SIGSTOP); + kill(child, SIGCONT); + } else { + break; + } + } + /* Return the child's exit code if possible */ + if (WIFEXITED(status)) { + exit(WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + kill(getpid(), WTERMSIG(status)); + } + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) +{ + enum { + OPT_PRESERVE_CRED = CHAR_MAX + 1 + }; + static const struct option longopts[] = { + { "all", no_argument, NULL, 'a' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V'}, + { "target", required_argument, NULL, 't' }, + { "mount", optional_argument, NULL, 'm' }, + { "uts", optional_argument, NULL, 'u' }, + { "ipc", optional_argument, NULL, 'i' }, + { "net", optional_argument, NULL, 'n' }, + { "pid", optional_argument, NULL, 'p' }, + { "user", optional_argument, NULL, 'U' }, + { "cgroup", optional_argument, NULL, 'C' }, + { "setuid", required_argument, NULL, 'S' }, + { "setgid", required_argument, NULL, 'G' }, + { "root", optional_argument, NULL, 'r' }, + { "wd", optional_argument, NULL, 'w' }, + { "no-fork", no_argument, NULL, 'F' }, + { "preserve-credentials", no_argument, NULL, OPT_PRESERVE_CRED }, +#ifdef HAVE_LIBSELINUX + { "follow-context", no_argument, NULL, 'Z' }, +#endif + { NULL, 0, NULL, 0 } + }; + + struct namespace_file *nsfile; + int c, pass, namespaces = 0, setgroups_nerrs = 0, preserve_cred = 0; + bool do_rd = false, do_wd = false, force_uid = false, force_gid = false; + bool do_all = false; + int do_fork = -1; /* unknown yet */ + uid_t uid = 0; + gid_t gid = 0; +#ifdef HAVE_LIBSELINUX + bool selinux = 0; +#endif + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = + getopt_long(argc, argv, "+ahVt:m::u::i::n::p::C::U::S:G:r::w::FZ", + longopts, NULL)) != -1) { + switch (c) { + case 'h': + usage(); + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'a': + do_all = true; + break; + case 't': + namespace_target_pid = + strtoul_or_err(optarg, _("failed to parse pid")); + break; + case 'm': + if (optarg) + open_namespace_fd(CLONE_NEWNS, optarg); + else + namespaces |= CLONE_NEWNS; + break; + case 'u': + if (optarg) + open_namespace_fd(CLONE_NEWUTS, optarg); + else + namespaces |= CLONE_NEWUTS; + break; + case 'i': + if (optarg) + open_namespace_fd(CLONE_NEWIPC, optarg); + else + namespaces |= CLONE_NEWIPC; + break; + case 'n': + if (optarg) + open_namespace_fd(CLONE_NEWNET, optarg); + else + namespaces |= CLONE_NEWNET; + break; + case 'p': + if (optarg) + open_namespace_fd(CLONE_NEWPID, optarg); + else + namespaces |= CLONE_NEWPID; + break; + case 'C': + if (optarg) + open_namespace_fd(CLONE_NEWCGROUP, optarg); + else + namespaces |= CLONE_NEWCGROUP; + break; + case 'U': + if (optarg) + open_namespace_fd(CLONE_NEWUSER, optarg); + else + namespaces |= CLONE_NEWUSER; + break; + case 'S': + uid = strtoul_or_err(optarg, _("failed to parse uid")); + force_uid = true; + break; + case 'G': + gid = strtoul_or_err(optarg, _("failed to parse gid")); + force_gid = true; + break; + case 'F': + do_fork = 0; + break; + case 'r': + if (optarg) + open_target_fd(&root_fd, "root", optarg); + else + do_rd = true; + break; + case 'w': + if (optarg) + open_target_fd(&wd_fd, "cwd", optarg); + else + do_wd = true; + break; + case OPT_PRESERVE_CRED: + preserve_cred = 1; + break; +#ifdef HAVE_LIBSELINUX + case 'Z': + selinux = 1; + break; +#endif + default: + errtryhelp(EXIT_FAILURE); + } + } + +#ifdef HAVE_LIBSELINUX + if (selinux && is_selinux_enabled() > 0) { + char *scon = NULL; + + if (!namespace_target_pid) + errx(EXIT_FAILURE, _("no target PID specified for --follow-context")); + if (getpidcon(namespace_target_pid, &scon) < 0) + errx(EXIT_FAILURE, _("failed to get %d SELinux context"), + (int) namespace_target_pid); + if (setexeccon(scon) < 0) + errx(EXIT_FAILURE, _("failed to set exec context to '%s'"), scon); + freecon(scon); + } +#endif + + if (do_all) { + if (!namespace_target_pid) + errx(EXIT_FAILURE, _("no target PID specified for --all")); + for (nsfile = namespace_files; nsfile->nstype; nsfile++) { + if (nsfile->fd >= 0) + continue; /* namespace already specified */ + + /* It is not permitted to use setns(2) to reenter the caller's + * current user namespace; see setns(2) man page for more details. + */ + if (nsfile->nstype & CLONE_NEWUSER + && is_same_namespace(getpid(), namespace_target_pid, nsfile->name)) + continue; + + namespaces |= nsfile->nstype; + } + } + + /* + * Open remaining namespace and directory descriptors. + */ + for (nsfile = namespace_files; nsfile->nstype; nsfile++) + if (nsfile->nstype & namespaces) + open_namespace_fd(nsfile->nstype, NULL); + if (do_rd) + open_target_fd(&root_fd, "root", NULL); + if (do_wd) + open_target_fd(&wd_fd, "cwd", NULL); + + /* + * Update namespaces variable to contain all requested namespaces + */ + for (nsfile = namespace_files; nsfile->nstype; nsfile++) { + if (nsfile->fd < 0) + continue; + namespaces |= nsfile->nstype; + } + + /* for user namespaces we always set UID and GID (default is 0) + * and clear root's groups if --preserve-credentials is no specified */ + if ((namespaces & CLONE_NEWUSER) && !preserve_cred) { + force_uid = true, force_gid = true; + + /* We call setgroups() before and after we enter user namespace, + * let's complain only if both fail */ + if (setgroups(0, NULL) != 0) + setgroups_nerrs++; + } + + /* + * Now that we know which namespaces we want to enter, enter + * them. Do this in two passes, not entering the user + * namespace on the first pass. So if we're deprivileging the + * container we'll enter the user namespace last and if we're + * privileging it then we enter the user namespace first + * (because the initial setns will fail). + */ + for (pass = 0; pass < 2; pass ++) { + for (nsfile = namespace_files + 1 - pass; nsfile->nstype; nsfile++) { + if (nsfile->fd < 0) + continue; + if (nsfile->nstype == CLONE_NEWPID && do_fork == -1) + do_fork = 1; + if (setns(nsfile->fd, nsfile->nstype)) { + if (pass != 0) + err(EXIT_FAILURE, + _("reassociate to namespace '%s' failed"), + nsfile->name); + else + continue; + } + + close(nsfile->fd); + nsfile->fd = -1; + } + } + + /* Remember the current working directory if I'm not changing it */ + if (root_fd >= 0 && wd_fd < 0) { + wd_fd = open(".", O_RDONLY); + if (wd_fd < 0) + err(EXIT_FAILURE, + _("cannot open current working directory")); + } + + /* Change the root directory */ + if (root_fd >= 0) { + if (fchdir(root_fd) < 0) + err(EXIT_FAILURE, + _("change directory by root file descriptor failed")); + + if (chroot(".") < 0) + err(EXIT_FAILURE, _("chroot failed")); + + close(root_fd); + root_fd = -1; + } + + /* Change the working directory */ + if (wd_fd >= 0) { + if (fchdir(wd_fd) < 0) + err(EXIT_FAILURE, + _("change directory by working directory file descriptor failed")); + + close(wd_fd); + wd_fd = -1; + } + + if (do_fork == 1) + continue_as_child(); + + if (force_uid || force_gid) { + if (force_gid && setgroups(0, NULL) != 0 && setgroups_nerrs) /* drop supplementary groups */ + err(EXIT_FAILURE, _("setgroups failed")); + if (force_gid && setgid(gid) < 0) /* change GID */ + err(EXIT_FAILURE, _("setgid failed")); + if (force_uid && setuid(uid) < 0) /* change UID */ + err(EXIT_FAILURE, _("setuid failed")); + } + + if (optind < argc) { + execvp(argv[optind], argv + optind); + errexec(argv[optind]); + } + exec_shell(); +} diff --git a/sys-utils/pivot_root.8 b/sys-utils/pivot_root.8 new file mode 100644 index 0000000..febedd0 --- /dev/null +++ b/sys-utils/pivot_root.8 @@ -0,0 +1,75 @@ +.TH PIVOT_ROOT 8 "August 2011" "util-linux" "System Administration" +.SH NAME +pivot_root \- change the root filesystem +.SH SYNOPSIS +.B pivot_root +.I new_root put_old +.SH DESCRIPTION +\fBpivot_root\fP moves the root file system of the current process to the +directory \fIput_old\fP and makes \fInew_root\fP the new root file system. +Since \fBpivot_root\fP(8) simply calls \fBpivot_root\fP(2), we refer to +the man page of the latter for further details. + +Note that, depending on the implementation of \fBpivot_root\fP, root and +cwd of the caller may or may not change. The following is a sequence for +invoking \fBpivot_root\fP that works in either case, assuming that +\fBpivot_root\fP and \fBchroot\fP are in the current \fBPATH\fP: +.sp +cd \fInew_root\fP +.br +pivot_root . \fIput_old\fP +.br +exec chroot . \fIcommand\fP +.sp +Note that \fBchroot\fP must be available under the old root and under the new +root, because \fBpivot_root\fP may or may not have implicitly changed the +root directory of the shell. + +Note that \fBexec chroot\fP changes the running executable, which is +necessary if the old root directory should be unmounted afterwards. +Also note that standard input, output, and error may still point to a +device on the old root file system, keeping it busy. They can easily be +changed when invoking \fBchroot\fP (see below; note the absence of +leading slashes to make it work whether \fBpivot_root\fP has changed the +shell's root or not). +.SH OPTIONS +.TP +\fB\-V\fR, \fB\-\-version\fR +Display version information and exit. +.TP +\fB\-h\fR, \fB\-\-help\fR +Display help text and exit. +.SH EXAMPLES +Change the root file system to /dev/hda1 from an interactive shell: +.sp +.nf +mount /dev/hda1 /new-root +cd /new-root +pivot_root . old-root +exec chroot . sh <dev/console >dev/console 2>&1 +umount /old-root +.fi +.sp +Mount the new root file system over NFS from 10.0.0.1:/my_root and run +\fBinit\fP: +.sp +.nf +ifconfig lo 127.0.0.1 up # for portmap +# configure Ethernet or such +portmap # for lockd (implicitly started by mount) +mount -o ro 10.0.0.1:/my_root /mnt +killall portmap # portmap keeps old root busy +cd /mnt +pivot_root . old_root +exec chroot . sh -c 'umount /old_root; exec /sbin/init' \\ + <dev/console >dev/console 2>&1 +.fi +.SH "SEE ALSO" +.BR chroot (1), +.BR pivot_root (2), +.BR mount (8), +.BR switch_root (8), +.BR umount (8) +.SH AVAILABILITY +The pivot_root command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/pivot_root.c b/sys-utils/pivot_root.c new file mode 100644 index 0000000..ea76d94 --- /dev/null +++ b/sys-utils/pivot_root.c @@ -0,0 +1,80 @@ +/* + * pivot_root.c - Change the root file system + * + * Copyright (C) 2000 Werner Almesberger + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <err.h> +#include <errno.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/syscall.h> +#include <unistd.h> + +#include "c.h" +#include "nls.h" +#include "closestream.h" + +#define pivot_root(new_root,put_old) syscall(SYS_pivot_root,new_root,put_old) + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] new_root put_old\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Change the root filesystem.\n"), out); + + fputs(USAGE_OPTIONS, out); + printf(USAGE_HELP_OPTIONS(16)); + printf(USAGE_MAN_TAIL("pivot_root(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + int ch; + static const struct option longopts[] = { + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((ch = getopt_long(argc, argv, "Vh", longopts, NULL)) != -1) + switch (ch) { + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + + if (argc != 3) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + if (pivot_root(argv[1], argv[2]) < 0) + err(EXIT_FAILURE, _("failed to change root from `%s' to `%s'"), + argv[1], argv[2]); + + return EXIT_SUCCESS; +} diff --git a/sys-utils/prlimit.1 b/sys-utils/prlimit.1 new file mode 100644 index 0000000..1230b3a --- /dev/null +++ b/sys-utils/prlimit.1 @@ -0,0 +1,120 @@ +.\" prlimit.1 -- +.\" Copyright 2011 Davidlohr Bueso <dave@gnu.org> +.\" May be distributed under the GNU General Public License + +.TH PRLIMIT 1 "July 2014" "util-linux" "User Commands" +.SH NAME +prlimit \- get and set process resource limits +.SH SYNOPSIS +.BR prlimit " [options]" +.RB [ \-\-\fIresource\fR [ =\fIlimits\fR] +.RB [ \-\-pid\ \fIPID\fR] + +.BR prlimit " [options]" +.RB [ \-\-\fIresource\fR [ =\fIlimits\fR] +.IR "command " [ argument ...] + +.SH DESCRIPTION +Given a process ID and one or more resources, \fBprlimit\fP tries to retrieve +and/or modify the limits. + +When \fIcommand\fR is given, +.B prlimit +will run this command with the given arguments. + +The \fIlimits\fP parameter is composed of a soft and a hard value, separated +by a colon (:), in order to modify the existing values. If no \fIlimits\fR are +given, \fBprlimit\fP will display the current values. If one of the values +is not given, then the existing one will be used. To specify the unlimited or +infinity limit (RLIM_INFINITY), the -1 or 'unlimited' string can be passed. + +Because of the nature of limits, the soft limit must be lower or equal to the +high limit (also called the ceiling). To see all available resource limits, +refer to the RESOURCE OPTIONS section. + +.IP "\fIsoft\fP:\fIhard\fP Specify both limits." +.IP "\fIsoft\fP: Specify only the soft limit." +.IP ":\fIhard\fP Specify only the hard limit." +.IP "\fIvalue\fP Specify both limits to the same value." + +.SH GENERAL OPTIONS +.IP "\fB\-h, \-\-help\fP" +Display help text and exit. +.IP "\fB\-\-noheadings\fP" +Do not print a header line. +.IP "\fB\-o, \-\-output \fIlist\fP" +Define the output columns to use. If no output arrangement is specified, +then a default set is used. +Use \fB\-\-help\fP to get a list of all supported columns. +.IP "\fB\-p, \-\-pid\fP" +Specify the process id; if none is given, the running process will be used. +.IP "\fB\-\-raw\fP" +Use the raw output format. +.IP "\fB\-\-verbose\fP" +Verbose mode. +.IP "\fB\-V, \-\-version\fP" +Display version information and exit. + +.SH RESOURCE OPTIONS +.IP "\fB\-c, \-\-core\fP[=\fIlimits\fR]" +Maximum size of a core file. +.IP "\fB\-d, \-\-data\fP[=\fIlimits\fR]" +Maximum data size. +.IP "\fB\-e, \-\-nice\fP[=\fIlimits\fR]" +Maximum nice priority allowed to raise. +.IP "\fB\-f, \-\-fsize\fP[=\fIlimits\fR]" +Maximum file size. +.IP "\fB\-i, \-\-sigpending\fP[=\fIlimits\fR]" +Maximum number of pending signals. +.IP "\fB\-l, \-\-memlock\fP[=\fIlimits\fR]" +Maximum locked-in-memory address space. +.IP "\fB\-m, \-\-rss\fP[=\fIlimits\fR]" +Maximum Resident Set Size (RSS). +.IP "\fB\-n, \-\-nofile\fP[=\fIlimits\fR]" +Maximum number of open files. +.IP "\fB\-q, \-\-msgqueue\fP[=\fIlimits\fR]" +Maximum number of bytes in POSIX message queues. +.IP "\fB\-r, \-\-rtprio\fP[=\fIlimits\fR]" +Maximum real-time priority. +.IP "\fB\-s, \-\-stack\fP[=\fIlimits\fR]" +Maximum size of the stack. +.IP "\fB\-t, \-\-cpu\fP[=\fIlimits\fR]" +CPU time, in seconds. +.IP "\fB\-u, \-\-nproc\fP[=\fIlimits\fR]" +Maximum number of processes. +.IP "\fB\-v, \-\-as\fP[=\fIlimits\fR]" +Address space limit. +.IP "\fB\-x, \-\-locks\fP[=\fIlimits\fR]" +Maximum number of file locks held. +.IP "\fB\-y, \-\-rttime\fP[=\fIlimits\fR]" +Timeout for real-time tasks. + +.SH EXAMPLES +.IP "\fBprlimit \-\-pid 13134\fP" +Display limit values for all current resources. +.IP "\fBprlimit \-\-pid 13134 \--rss --nofile=1024:4095\fP" +Display the limits of the RSS, and set the soft and hard limits for the number +of open files to 1024 and 4095, respectively. +.IP "\fBprlimit \-\-pid 13134 --nproc=512:\fP" +Modify only the soft limit for the number of processes. +.IP "\fBprlimit \-\-pid $$ --nproc=unlimited\fP" +Set for the current process both the soft and ceiling values for the number of +processes to unlimited. +.IP "\fBprlimit --cpu=10 sort -u hugefile\fP" +Set both the soft and hard CPU time limit to ten seconds and run 'sort'. + +.SH "SEE ALSO" +.BR ulimit (1), +.BR prlimit (2) + +.SH NOTES +The prlimit system call is supported since Linux 2.6.36, older kernels will +break this program. + +.SH AUTHORS +.nf +Davidlohr Bueso <dave@gnu.org> - In memory of Dennis M. Ritchie. +.fi +.SH AVAILABILITY +The prlimit command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/prlimit.c b/sys-utils/prlimit.c new file mode 100644 index 0000000..6f80636 --- /dev/null +++ b/sys-utils/prlimit.c @@ -0,0 +1,646 @@ +/* + * prlimit - get/set process resource limits. + * + * Copyright (C) 2011 Davidlohr Bueso <dave@gnu.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <errno.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <assert.h> +#include <unistd.h> +#include <sys/resource.h> + +#include <libsmartcols.h> + +#include "c.h" +#include "nls.h" +#include "xalloc.h" +#include "strutils.h" +#include "list.h" +#include "closestream.h" + +#ifndef RLIMIT_RTTIME +# define RLIMIT_RTTIME 15 +#endif + +enum { + AS, + CORE, + CPU, + DATA, + FSIZE, + LOCKS, + MEMLOCK, + MSGQUEUE, + NICE, + NOFILE, + NPROC, + RSS, + RTPRIO, + RTTIME, + SIGPENDING, + STACK +}; + +/* basic output flags */ +static int no_headings; +static int raw; + +struct prlimit_desc { + const char *name; + const char *help; + const char *unit; + int resource; +}; + +static struct prlimit_desc prlimit_desc[] = +{ + [AS] = { "AS", N_("address space limit"), N_("bytes"), RLIMIT_AS }, + [CORE] = { "CORE", N_("max core file size"), N_("bytes"), RLIMIT_CORE }, + [CPU] = { "CPU", N_("CPU time"), N_("seconds"), RLIMIT_CPU }, + [DATA] = { "DATA", N_("max data size"), N_("bytes"), RLIMIT_DATA }, + [FSIZE] = { "FSIZE", N_("max file size"), N_("bytes"), RLIMIT_FSIZE }, + [LOCKS] = { "LOCKS", N_("max number of file locks held"), N_("locks"), RLIMIT_LOCKS }, + [MEMLOCK] = { "MEMLOCK", N_("max locked-in-memory address space"), N_("bytes"), RLIMIT_MEMLOCK }, + [MSGQUEUE] = { "MSGQUEUE", N_("max bytes in POSIX mqueues"), N_("bytes"), RLIMIT_MSGQUEUE }, + [NICE] = { "NICE", N_("max nice prio allowed to raise"), NULL, RLIMIT_NICE }, + [NOFILE] = { "NOFILE", N_("max number of open files"), N_("files"), RLIMIT_NOFILE }, + [NPROC] = { "NPROC", N_("max number of processes"), N_("processes"), RLIMIT_NPROC }, + [RSS] = { "RSS", N_("max resident set size"), N_("bytes"), RLIMIT_RSS }, + [RTPRIO] = { "RTPRIO", N_("max real-time priority"), NULL, RLIMIT_RTPRIO }, + [RTTIME] = { "RTTIME", N_("timeout for real-time tasks"), N_("microsecs"), RLIMIT_RTTIME }, + [SIGPENDING] = { "SIGPENDING", N_("max number of pending signals"), N_("signals"), RLIMIT_SIGPENDING }, + [STACK] = { "STACK", N_("max stack size"), N_("bytes"), RLIMIT_STACK } +}; + +#define MAX_RESOURCES ARRAY_SIZE(prlimit_desc) + +struct prlimit { + struct list_head lims; + + struct rlimit rlim; + struct prlimit_desc *desc; + int modify; /* PRLIMIT_{SOFT,HARD} mask */ +}; + +#define PRLIMIT_EMPTY_LIMIT {{ 0, 0, }, NULL, 0 } + +enum { + COL_HELP, + COL_RES, + COL_SOFT, + COL_HARD, + COL_UNITS, +}; + +/* column names */ +struct colinfo { + const char *name; /* header */ + double whint; /* width hint (N < 1 is in percent of termwidth) */ + int flags; /* SCOLS_FL_* */ + const char *help; +}; + +/* columns descriptions */ +static struct colinfo infos[] = { + [COL_RES] = { "RESOURCE", 0.25, SCOLS_FL_TRUNC, N_("resource name") }, + [COL_HELP] = { "DESCRIPTION", 0.1, SCOLS_FL_TRUNC, N_("resource description")}, + [COL_SOFT] = { "SOFT", 0.1, SCOLS_FL_RIGHT, N_("soft limit")}, + [COL_HARD] = { "HARD", 1, SCOLS_FL_RIGHT, N_("hard limit (ceiling)")}, + [COL_UNITS] = { "UNITS", 0.1, SCOLS_FL_TRUNC, N_("units")}, +}; + +static int columns[ARRAY_SIZE(infos) * 2]; +static int ncolumns; + + + +#define INFINITY_STR "unlimited" +#define INFINITY_STRLEN (sizeof(INFINITY_STR) - 1) + +#define PRLIMIT_SOFT (1 << 1) +#define PRLIMIT_HARD (1 << 2) + +static pid_t pid; /* calling process (default) */ +static int verbose; + +#ifndef HAVE_PRLIMIT +# include <sys/syscall.h> +static int prlimit(pid_t p, int resource, + const struct rlimit *new_limit, + struct rlimit *old_limit) +{ + return syscall(SYS_prlimit64, p, resource, new_limit, old_limit); +} +#endif + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + + fprintf(out, + _(" %s [options] [-p PID]\n"), program_invocation_short_name); + fprintf(out, + _(" %s [options] COMMAND\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Show or change the resource limits of a process.\n"), out); + + fputs(_("\nGeneral Options:\n"), out); + fputs(_(" -p, --pid <pid> process id\n" + " -o, --output <list> define which output columns to use\n" + " --noheadings don't print headings\n" + " --raw use the raw output format\n" + " --verbose verbose output\n" + ), out); + printf(USAGE_HELP_OPTIONS(24)); + + fputs(_("\nResources Options:\n"), out); + fputs(_(" -c, --core maximum size of core files created\n" + " -d, --data maximum size of a process's data segment\n" + " -e, --nice maximum nice priority allowed to raise\n" + " -f, --fsize maximum size of files written by the process\n" + " -i, --sigpending maximum number of pending signals\n" + " -l, --memlock maximum size a process may lock into memory\n" + " -m, --rss maximum resident set size\n" + " -n, --nofile maximum number of open files\n" + " -q, --msgqueue maximum bytes in POSIX message queues\n" + " -r, --rtprio maximum real-time scheduling priority\n" + " -s, --stack maximum stack size\n" + " -t, --cpu maximum amount of CPU time in seconds\n" + " -u, --nproc maximum number of user processes\n" + " -v, --as size of virtual memory\n" + " -x, --locks maximum number of file locks\n" + " -y, --rttime CPU time in microseconds a process scheduled\n" + " under real-time scheduling\n"), out); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(infos); i++) + fprintf(out, " %11s %s\n", infos[i].name, _(infos[i].help)); + + printf(USAGE_MAN_TAIL("prlimit(1)")); + + exit(EXIT_SUCCESS); +} + +static inline int get_column_id(int num) +{ + assert(num < ncolumns); + assert(columns[num] < (int) ARRAY_SIZE(infos)); + + return columns[num]; +} + +static inline struct colinfo *get_column_info(unsigned num) +{ + return &infos[ get_column_id(num) ]; +} + +static void add_scols_line(struct libscols_table *table, struct prlimit *l) +{ + int i; + struct libscols_line *line; + + assert(table); + assert(l); + + line = scols_table_new_line(table, NULL); + if (!line) + err(EXIT_FAILURE, _("failed to allocate output line")); + + for (i = 0; i < ncolumns; i++) { + char *str = NULL; + + switch (get_column_id(i)) { + case COL_RES: + str = xstrdup(l->desc->name); + break; + case COL_HELP: + str = xstrdup(l->desc->help); + break; + case COL_SOFT: + if (l->rlim.rlim_cur == RLIM_INFINITY) + str = xstrdup(_("unlimited")); + else + xasprintf(&str, "%llu", (unsigned long long) l->rlim.rlim_cur); + break; + case COL_HARD: + if (l->rlim.rlim_max == RLIM_INFINITY) + str = xstrdup(_("unlimited")); + else + xasprintf(&str, "%llu", (unsigned long long) l->rlim.rlim_max); + break; + case COL_UNITS: + str = l->desc->unit ? xstrdup(_(l->desc->unit)) : NULL; + break; + default: + break; + } + + if (str && scols_line_refer_data(line, i, str)) + err(EXIT_FAILURE, _("failed to add output data")); + } +} + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + assert(name); + + for (i = 0; i < ARRAY_SIZE(infos); i++) { + const char *cn = infos[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static void rem_prlim(struct prlimit *lim) +{ + if (!lim) + return; + list_del(&lim->lims); + free(lim); +} + +static int show_limits(struct list_head *lims) +{ + int i; + struct list_head *p, *pnext; + struct libscols_table *table; + + table = scols_new_table(); + if (!table) + err(EXIT_FAILURE, _("failed to allocate output table")); + + scols_table_enable_raw(table, raw); + scols_table_enable_noheadings(table, no_headings); + + for (i = 0; i < ncolumns; i++) { + struct colinfo *col = get_column_info(i); + + if (!scols_table_new_column(table, col->name, col->whint, col->flags)) + err(EXIT_FAILURE, _("failed to allocate output column")); + } + + list_for_each_safe(p, pnext, lims) { + struct prlimit *lim = list_entry(p, struct prlimit, lims); + + add_scols_line(table, lim); + rem_prlim(lim); + } + + scols_print_table(table); + scols_unref_table(table); + return 0; +} + +/* + * If one of the limits is unknown (default value for not being passed), we + * need to get the current limit and use it. I see no other way other than + * using prlimit(2). + */ +static void get_unknown_hardsoft(struct prlimit *lim) +{ + struct rlimit old; + + if (prlimit(pid, lim->desc->resource, NULL, &old) == -1) + err(EXIT_FAILURE, _("failed to get old %s limit"), + lim->desc->name); + + if (!(lim->modify & PRLIMIT_SOFT)) + lim->rlim.rlim_cur = old.rlim_cur; + else if (!(lim->modify & PRLIMIT_HARD)) + lim->rlim.rlim_max = old.rlim_max; +} + +static void do_prlimit(struct list_head *lims) +{ + struct list_head *p, *pnext; + + list_for_each_safe(p, pnext, lims) { + struct rlimit *new = NULL, *old = NULL; + struct prlimit *lim = list_entry(p, struct prlimit, lims); + + if (lim->modify) { + if (lim->modify != (PRLIMIT_HARD | PRLIMIT_SOFT)) + get_unknown_hardsoft(lim); + + if ((lim->rlim.rlim_cur > lim->rlim.rlim_max) && + (lim->rlim.rlim_cur != RLIM_INFINITY || + lim->rlim.rlim_max != RLIM_INFINITY)) + errx(EXIT_FAILURE, _("the soft limit %s cannot exceed the hard limit"), + lim->desc->name); + new = &lim->rlim; + } else + old = &lim->rlim; + + if (verbose && new) { + printf(_("New %s limit for pid %d: "), lim->desc->name, + pid ? pid : getpid()); + if (new->rlim_cur == RLIM_INFINITY) + printf("<%s", _("unlimited")); + else + printf("<%ju", (uintmax_t)new->rlim_cur); + + if (new->rlim_max == RLIM_INFINITY) + printf(":%s>\n", _("unlimited")); + else + printf(":%ju>\n", (uintmax_t)new->rlim_max); + } + + if (prlimit(pid, lim->desc->resource, new, old) == -1) + err(EXIT_FAILURE, lim->modify ? + _("failed to set the %s resource limit") : + _("failed to get the %s resource limit"), + lim->desc->name); + + if (lim->modify) + rem_prlim(lim); /* modify only; don't show */ + } +} + +static int get_range(char *str, rlim_t *soft, rlim_t *hard, int *found) +{ + char *end = NULL; + + if (!str) + return 0; + + *found = errno = 0; + *soft = *hard = RLIM_INFINITY; + + if (!strcmp(str, INFINITY_STR)) { /* <unlimited> */ + *found |= PRLIMIT_SOFT | PRLIMIT_HARD; + return 0; + + } else if (*str == ':') { /* <:hard> */ + str++; + + if (strcmp(str, INFINITY_STR) != 0) { + *hard = strtoull(str, &end, 10); + + if (errno || !end || *end || end == str) + return -1; + } + *found |= PRLIMIT_HARD; + return 0; + + } + + if (strncmp(str, INFINITY_STR, INFINITY_STRLEN) == 0) { + /* <unlimited> or <unlimited:> */ + end = str + INFINITY_STRLEN; + } else { + /* <value> or <soft:> */ + *hard = *soft = strtoull(str, &end, 10); + if (errno || !end || end == str) + return -1; + } + + if (*end == ':' && !*(end + 1)) /* <soft:> */ + *found |= PRLIMIT_SOFT; + + else if (*end == ':') { /* <soft:hard> */ + str = end + 1; + + if (!strcmp(str, INFINITY_STR)) + *hard = RLIM_INFINITY; + else { + end = NULL; + errno = 0; + *hard = strtoull(str, &end, 10); + + if (errno || !end || *end || end == str) + return -1; + } + *found |= PRLIMIT_SOFT | PRLIMIT_HARD; + + } else /* <value> */ + *found |= PRLIMIT_SOFT | PRLIMIT_HARD; + + return 0; +} + + +static int parse_prlim(struct rlimit *lim, char *ops, size_t id) +{ + rlim_t soft, hard; + int found = 0; + + if (get_range(ops, &soft, &hard, &found)) + errx(EXIT_FAILURE, _("failed to parse %s limit"), + prlimit_desc[id].name); + + lim->rlim_cur = soft; + lim->rlim_max = hard; + + return found; +} + +static int add_prlim(char *ops, struct list_head *lims, size_t id) +{ + struct prlimit *lim = xcalloc(1, sizeof(*lim)); + + INIT_LIST_HEAD(&lim->lims); + lim->desc = &prlimit_desc[id]; + + if (ops) + lim->modify = parse_prlim(&lim->rlim, ops, id); + + list_add_tail(&lim->lims, lims); + return 0; +} + +int main(int argc, char **argv) +{ + int opt; + struct list_head lims; + + enum { + VERBOSE_OPTION = CHAR_MAX + 1, + RAW_OPTION, + NOHEADINGS_OPTION + }; + + static const struct option longopts[] = { + { "pid", required_argument, NULL, 'p' }, + { "output", required_argument, NULL, 'o' }, + { "as", optional_argument, NULL, 'v' }, + { "core", optional_argument, NULL, 'c' }, + { "cpu", optional_argument, NULL, 't' }, + { "data", optional_argument, NULL, 'd' }, + { "fsize", optional_argument, NULL, 'f' }, + { "locks", optional_argument, NULL, 'x' }, + { "memlock", optional_argument, NULL, 'l' }, + { "msgqueue", optional_argument, NULL, 'q' }, + { "nice", optional_argument, NULL, 'e' }, + { "nofile", optional_argument, NULL, 'n' }, + { "nproc", optional_argument, NULL, 'u' }, + { "rss", optional_argument, NULL, 'm' }, + { "rtprio", optional_argument, NULL, 'r' }, + { "rttime", optional_argument, NULL, 'y' }, + { "sigpending", optional_argument, NULL, 'i' }, + { "stack", optional_argument, NULL, 's' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + { "noheadings", no_argument, NULL, NOHEADINGS_OPTION }, + { "raw", no_argument, NULL, RAW_OPTION }, + { "verbose", no_argument, NULL, VERBOSE_OPTION }, + { NULL, 0, NULL, 0 } + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + INIT_LIST_HEAD(&lims); + + /* + * Something is very wrong if this doesn't succeed, + * assuming STACK is the last resource, of course. + */ + assert(MAX_RESOURCES == STACK + 1); + + while((opt = getopt_long(argc, argv, + "+c::d::e::f::i::l::m::n::q::r::s::t::u::v::x::y::p:o:vVh", + longopts, NULL)) != -1) { + switch(opt) { + case 'c': + add_prlim(optarg, &lims, CORE); + break; + case 'd': + add_prlim(optarg, &lims, DATA); + break; + case 'e': + add_prlim(optarg, &lims, NICE); + break; + case 'f': + add_prlim(optarg, &lims, FSIZE); + break; + case 'i': + add_prlim(optarg, &lims, SIGPENDING); + break; + case 'l': + add_prlim(optarg, &lims, MEMLOCK); + break; + case 'm': + add_prlim(optarg, &lims, RSS); + break; + case 'n': + add_prlim(optarg, &lims, NOFILE); + break; + case 'q': + add_prlim(optarg, &lims, MSGQUEUE); + break; + case 'r': + add_prlim(optarg, &lims, RTPRIO); + break; + case 's': + add_prlim(optarg, &lims, STACK); + break; + case 't': + add_prlim(optarg, &lims, CPU); + break; + case 'u': + add_prlim(optarg, &lims, NPROC); + break; + case 'v': + add_prlim(optarg, &lims, AS); + break; + case 'x': + add_prlim(optarg, &lims, LOCKS); + break; + case 'y': + add_prlim(optarg, &lims, RTTIME); + break; + + case 'p': + if (pid) + errx(EXIT_FAILURE, _("option --pid may be specified only once")); + pid = strtos32_or_err(optarg, _("invalid PID argument")); + break; + case 'h': + usage(); + case 'o': + ncolumns = string_to_idarray(optarg, + columns, ARRAY_SIZE(columns), + column_name_to_id); + if (ncolumns < 0) + return EXIT_FAILURE; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + + case NOHEADINGS_OPTION: + no_headings = 1; + break; + case VERBOSE_OPTION: + verbose++; + break; + case RAW_OPTION: + raw = 1; + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + if (argc > optind && pid) + errx(EXIT_FAILURE, _("options --pid and COMMAND are mutually exclusive")); + if (!ncolumns) { + /* default columns */ + columns[ncolumns++] = COL_RES; + columns[ncolumns++] = COL_HELP; + columns[ncolumns++] = COL_SOFT; + columns[ncolumns++] = COL_HARD; + columns[ncolumns++] = COL_UNITS; + } + + scols_init_debug(0); + + if (list_empty(&lims)) { + /* default is to print all resources */ + size_t n; + + for (n = 0; n < MAX_RESOURCES; n++) + add_prlim(NULL, &lims, n); + } + + do_prlimit(&lims); + + if (!list_empty(&lims)) + show_limits(&lims); + + if (argc > optind) { + /* prlimit [options] COMMAND */ + execvp(argv[optind], &argv[optind]); + errexec(argv[optind]); + } + + return EXIT_SUCCESS; +} diff --git a/sys-utils/readprofile.8 b/sys-utils/readprofile.8 new file mode 100644 index 0000000..a37aa0a --- /dev/null +++ b/sys-utils/readprofile.8 @@ -0,0 +1,153 @@ +.TH READPROFILE "8" "October 2011" "util-linux" "System Administration" +.SH NAME +readprofile \- read kernel profiling information +.SH SYNOPSIS +.B readprofile +[options] +.SH VERSION +This manpage documents version 2.0 of the program. +.SH DESCRIPTION +.LP +The +.B readprofile +command uses the +.I /proc/profile +information to print ascii data on standard output. The output is +organized in three columns: the first is the number of clock ticks, +the second is the name of the C function in the kernel where those +many ticks occurred, and the third is the normalized `load' of the +procedure, calculated as a ratio between the number of ticks and the +length of the procedure. The output is filled with blanks to ease +readability. +.SH OPTIONS +.TP +\fB\-a\fR, \fB\-\-all\fR +Print all symbols in the mapfile. By default the procedures with +reported ticks are not printed. +.TP +\fB\-b\fR, \fB\-\-histbin\fR +Print individual histogram-bin counts. +.TP +\fB\-i\fR, \fB\-\-info\fR +Info. This makes +.B readprofile +only print the profiling step used by the kernel. The profiling step +is the resolution of the profiling buffer, and is chosen during +kernel configuration (through `make config'), or in the kernel's +command line. If the +.B \-t +(terse) switch is used together with +.B \-i +only the decimal number is printed. +.TP +\fB\-m\fR, \fB\-\-mapfile\fR \fImapfile\fR +Specify a mapfile, which by default is +.IR /usr/src/linux/System.map . +You should specify the map file on cmdline if your current kernel +isn't the last one you compiled, or if you keep System.map elsewhere. +If the name of the map file ends with `.gz' it is decompressed on the +fly. +.TP +\fB\-M\fR, \fB\-\-multiplier\fR \fImultiplier\fR +On some architectures it is possible to alter the frequency at which +the kernel delivers profiling interrupts to each CPU. This option +allows you to set the frequency, as a multiplier of the system clock +frequency, HZ. Linux 2.6.16 dropped multiplier support for most systems. +This option also resets the profiling buffer, and requires superuser +privileges. +.TP +\fB\-p\fR, \fB\-\-profile\fR \fIpro-file\fR +Specify a different profiling buffer, which by default is +.IR /proc/profile . +Using a different pro-file is useful if you want to `freeze' the +kernel profiling at some time and read it later. The +.I /proc/profile +file can be copied using `cat' or `cp'. There is no more support for +compressed profile buffers, like in +.B readprofile-1.1, +because the program needs to know the size of the buffer in advance. +.TP +\fB\-r\fR, \fB\-\-reset\fR +Reset the profiling buffer. This can only be invoked by root, +because +.I /proc/profile +is readable by everybody but writable only by the superuser. +However, you can make +.B readprofile +set-user-ID 0, in order to reset the buffer without gaining privileges. +.TP +\fB\-s, \fB\-\-counters\fR +Print individual counters within functions. +.TP +\fB\-v\fR, \fB\-\-verbose\fR +Verbose. The output is organized in four columns and filled with +blanks. The first column is the RAM address of a kernel function, +the second is the name of the function, the third is the number of +clock ticks and the last is the normalized load. +.TP +\fB\-V\fR, \fB\-\-version\fR +Display version information and exit. +.TP +\fB\-h\fR, \fB\-\-help\fR +Display help text and exit. +.SH EXAMPLES +Browse the profiling buffer ordering by clock ticks: +.nf + readprofile | sort -nr | less + +.fi +Print the 20 most loaded procedures: +.nf + readprofile | sort -nr +2 | head -20 + +.fi +Print only filesystem profile: +.nf + readprofile | grep _ext2 + +.fi +Look at all the kernel information, with ram addresses: +.nf + readprofile -av | less + +.fi +Browse a `frozen' profile buffer for a non current kernel: +.nf + readprofile -p ~/profile.freeze -m /zImage.map.gz + +.fi +Request profiling at 2kHz per CPU, and reset the profiling buffer: +.nf + sudo readprofile -M 20 +.fi +.SH BUGS +.LP +.B readprofile +only works with a 1.3.x or newer kernel, because +.I /proc/profile +changed in the step from 1.2 to 1.3 +.LP +This program only works with ELF kernels. The change for a.out +kernels is trivial, and left as an exercise to the a.out user. +.LP +To enable profiling, the kernel must be rebooted, because no +profiling module is available, and it wouldn't be easy to build. To +enable profiling, you can specify "profile=2" (or another number) on +the kernel commandline. The number you specify is the two-exponent +used as profiling step. +.LP +Profiling is disabled when interrupts are inhibited. This means that +many profiling ticks happen when interrupts are re-enabled. Watch +out for misleading information. +.SH FILES +.nf +/proc/profile A binary snapshot of the profiling buffer. +/usr/src/linux/System.map The symbol table for the kernel. +/usr/src/linux/* The program being profiled :-) +.fi +.SH AVAILABILITY +The readprofile command is part of the util-linux package and is +available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/readprofile.c b/sys-utils/readprofile.c new file mode 100644 index 0000000..0350738 --- /dev/null +++ b/sys-utils/readprofile.c @@ -0,0 +1,407 @@ +/* + * readprofile.c - used to read /proc/profile + * + * Copyright (C) 1994,1996 Alessandro Rubini (rubini@ipvvis.unipv.it) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * 1999-02-22 Arkadiusz MiÅ›kiewicz <misiek@pld.ORG.PL> + * - added Native Language Support + * 1999-09-01 Stephane Eranian <eranian@cello.hpl.hp.com> + * - 64bit clean patch + * 3Feb2001 Andrew Morton <andrewm@uow.edu.au> + * - -M option to write profile multiplier. + * 2001-11-07 Werner Almesberger <wa@almesberger.net> + * - byte order auto-detection and -n option + * 2001-11-09 Werner Almesberger <wa@almesberger.net> + * - skip step size (index 0) + * 2002-03-09 John Levon <moz@compsoc.man.ac.uk> + * - make maplineno do something + * 2002-11-28 Mads Martin Joergensen + + * - also try /boot/System.map-`uname -r` + * 2003-04-09 Werner Almesberger <wa@almesberger.net> + * - fixed off-by eight error and improved heuristics in byte order detection + * 2003-08-12 Nikita Danilov <Nikita@Namesys.COM> + * - added -s option; example of use: + * "readprofile -s -m /boot/System.map-test | grep __d_lookup | sort -n -k3" + */ + +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/utsname.h> +#include <unistd.h> + +#include "nls.h" +#include "xalloc.h" +#include "closestream.h" + +#define S_LEN 128 + +/* These are the defaults */ +static char defaultmap[]="/boot/System.map"; +static char defaultpro[]="/proc/profile"; + +static FILE *myopen(char *name, char *mode, int *flag) +{ + int len = strlen(name); + + if (!strcmp(name + len - 3, ".gz")) { + FILE *res; + char *cmdline = xmalloc(len + 6); + sprintf(cmdline, "zcat %s", name); + res = popen(cmdline, mode); + free(cmdline); + *flag = 1; + return res; + } + *flag = 0; + return fopen(name, mode); +} + +#ifndef BOOT_SYSTEM_MAP +#define BOOT_SYSTEM_MAP "/boot/System.map-" +#endif + +static char *boot_uname_r_str(void) +{ + struct utsname uname_info; + char *s; + size_t len; + + if (uname(&uname_info)) + return ""; + len = strlen(BOOT_SYSTEM_MAP) + strlen(uname_info.release) + 1; + s = xmalloc(len); + strcpy(s, BOOT_SYSTEM_MAP); + strcat(s, uname_info.release); + return s; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Display kernel profiling information.\n"), out); + + fputs(USAGE_OPTIONS, out); + fprintf(out, + _(" -m, --mapfile <mapfile> (defaults: \"%s\" and\n"), defaultmap); + fprintf(out, + _(" \"%s\")\n"), boot_uname_r_str()); + fprintf(out, + _(" -p, --profile <pro-file> (default: \"%s\")\n"), defaultpro); + fputs(_(" -M, --multiplier <mult> set the profiling multiplier to <mult>\n"), out); + fputs(_(" -i, --info print only info about the sampling step\n"), out); + fputs(_(" -v, --verbose print verbose data\n"), out); + fputs(_(" -a, --all print all symbols, even if count is 0\n"), out); + fputs(_(" -b, --histbin print individual histogram-bin counts\n"), out); + fputs(_(" -s, --counters print individual counters within functions\n"), out); + fputs(_(" -r, --reset reset all the counters (root only)\n"), out); + fputs(_(" -n, --no-auto disable byte order auto-detection\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(27)); + printf(USAGE_MAN_TAIL("readprofile(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + FILE *map; + int proFd; + char *mapFile, *proFile, *mult = NULL; + size_t len = 0, indx = 1; + unsigned long long add0 = 0; + unsigned int step; + unsigned int *buf, total, fn_len; + unsigned long long fn_add, next_add; /* current and next address */ + char fn_name[S_LEN], next_name[S_LEN]; /* current and next name */ + char mode[8]; + int c; + ssize_t rc; + int optAll = 0, optInfo = 0, optReset = 0, optVerbose = 0, optNative = 0; + int optBins = 0, optSub = 0; + char mapline[S_LEN]; + int maplineno = 1; + int popenMap; /* flag to tell if popen() has been used */ + int header_printed; + + static const struct option longopts[] = { + {"mapfile", required_argument, NULL, 'm'}, + {"profile", required_argument, NULL, 'p'}, + {"multiplier", required_argument, NULL, 'M'}, + {"info", no_argument, NULL, 'i'}, + {"verbose", no_argument, NULL, 'v'}, + {"all", no_argument, NULL, 'a'}, + {"histbin", no_argument, NULL, 'b'}, + {"counters", no_argument, NULL, 's'}, + {"reset", no_argument, NULL, 'r'}, + {"no-auto", no_argument, NULL, 'n'}, + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + +#define next (current^1) + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + proFile = defaultpro; + mapFile = defaultmap; + + while ((c = getopt_long(argc, argv, "m:p:M:ivabsrnVh", longopts, NULL)) != -1) { + switch (c) { + case 'm': + mapFile = optarg; + break; + case 'n': + optNative++; + break; + case 'p': + proFile = optarg; + break; + case 'a': + optAll++; + break; + case 'b': + optBins++; + break; + case 's': + optSub++; + break; + case 'i': + optInfo++; + break; + case 'M': + mult = optarg; + break; + case 'r': + optReset++; + break; + case 'v': + optVerbose++; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (optReset || mult) { + int multiplier, fd, to_write; + + /* When writing the multiplier, if the length of the + * write is not sizeof(int), the multiplier is not + * changed. */ + if (mult) { + multiplier = strtoul(mult, NULL, 10); + to_write = sizeof(int); + } else { + multiplier = 0; + /* sth different from sizeof(int) */ + to_write = 1; + } + /* try to become root, just in case */ + ignore_result( setuid(0) ); + fd = open(defaultpro, O_WRONLY); + if (fd < 0) + err(EXIT_FAILURE, "%s", defaultpro); + if (write(fd, &multiplier, to_write) != to_write) + err(EXIT_FAILURE, _("error writing %s"), defaultpro); + close(fd); + exit(EXIT_SUCCESS); + } + + /* Use an fd for the profiling buffer, to skip stdio overhead */ + if (((proFd = open(proFile, O_RDONLY)) < 0) + || ((int)(len = lseek(proFd, 0, SEEK_END)) < 0) + || (lseek(proFd, 0, SEEK_SET) < 0)) + err(EXIT_FAILURE, "%s", proFile); + + buf = xmalloc(len); + + rc = read(proFd, buf, len); + if (rc < 0 || (size_t) rc != len) + err(EXIT_FAILURE, "%s", proFile); + close(proFd); + + if (!optNative) { + int entries = len / sizeof(*buf); + int big = 0, small = 0; + unsigned *p; + size_t i; + + for (p = buf + 1; p < buf + entries; p++) { + if (*p & ~0U << (sizeof(*buf) * 4)) + big++; + if (*p & ((1 << (sizeof(*buf) * 4)) - 1)) + small++; + } + if (big > small) { + warnx(_("Assuming reversed byte order. " + "Use -n to force native byte order.")); + for (p = buf; p < buf + entries; p++) + for (i = 0; i < sizeof(*buf) / 2; i++) { + unsigned char *b = (unsigned char *)p; + unsigned char tmp; + tmp = b[i]; + b[i] = b[sizeof(*buf) - i - 1]; + b[sizeof(*buf) - i - 1] = tmp; + } + } + } + + step = buf[0]; + if (optInfo) { + printf(_("Sampling_step: %u\n"), step); + exit(EXIT_SUCCESS); + } + + total = 0; + + map = myopen(mapFile, "r", &popenMap); + if (map == NULL && mapFile == defaultmap) { + mapFile = boot_uname_r_str(); + map = myopen(mapFile, "r", &popenMap); + } + if (map == NULL) + err(EXIT_FAILURE, "%s", mapFile); + + while (fgets(mapline, S_LEN, map)) { + if (sscanf(mapline, "%llx %7[^\n ] %127[^\n ]", &fn_add, mode, fn_name) != 3) + errx(EXIT_FAILURE, _("%s(%i): wrong map line"), mapFile, + maplineno); + /* only elf works like this */ + if (!strcmp(fn_name, "_stext") || !strcmp(fn_name, "__stext")) { + add0 = fn_add; + break; + } + maplineno++; + } + + if (!add0) + errx(EXIT_FAILURE, _("can't find \"_stext\" in %s"), mapFile); + + /* + * Main loop. + */ + while (fgets(mapline, S_LEN, map)) { + unsigned int this = 0; + int done = 0; + + if (sscanf(mapline, "%llx %7[^\n ] %127[^\n ]", &next_add, mode, next_name) != 3) + errx(EXIT_FAILURE, _("%s(%i): wrong map line"), mapFile, + maplineno); + header_printed = 0; + + /* the kernel only profiles up to _etext */ + if (!strcmp(next_name, "_etext") || + !strcmp(next_name, "__etext")) + done = 1; + else { + /* ignore any LEADING (before a '[tT]' symbol + * is found) Absolute symbols and __init_end + * because some architectures place it before + * .text section */ + if ((*mode == 'A' || *mode == '?') + && (total == 0 || !strcmp(next_name, "__init_end"))) + continue; + if (*mode != 'T' && *mode != 't' && + *mode != 'W' && *mode != 'w') + break; /* only text is profiled */ + } + + if (indx >= len / sizeof(*buf)) + errx(EXIT_FAILURE, + _("profile address out of range. Wrong map file?")); + + while (indx < (next_add - add0) / step) { + if (optBins && (buf[indx] || optAll)) { + if (!header_printed) { + printf("%s:\n", fn_name); + header_printed = 1; + } + printf("\t%llx\t%u\n", (indx - 1) * step + add0, + buf[indx]); + } + this += buf[indx++]; + } + total += this; + + if (optBins) { + if (optVerbose || this > 0) + printf(" total\t\t\t\t%u\n", this); + } else if ((this || optAll) && + (fn_len = next_add - fn_add) != 0) { + if (optVerbose) + printf("%016llx %-40s %6u %8.4f\n", fn_add, + fn_name, this, this / (double)fn_len); + else + printf("%6u %-40s %8.4f\n", + this, fn_name, this / (double)fn_len); + if (optSub) { + unsigned long long scan; + + for (scan = (fn_add - add0) / step + 1; + scan < (next_add - add0) / step; + scan++) { + unsigned long long addr; + addr = (scan - 1) * step + add0; + printf("\t%#llx\t%s+%#llx\t%u\n", + addr, fn_name, addr - fn_add, + buf[scan]); + } + } + } + + fn_add = next_add; + strcpy(fn_name, next_name); + + maplineno++; + if (done) + break; + } + + /* clock ticks, out of kernel text - probably modules */ + printf("%6u %s\n", buf[len / sizeof(*buf) - 1], "*unknown*"); + + /* trailer */ + if (optVerbose) + printf("%016x %-40s %6u %8.4f\n", + 0, "total", total, total / (double)(fn_add - add0)); + else + printf("%6u %-40s %8.4f\n", + total, _("total"), total / (double)(fn_add - add0)); + + popenMap ? pclose(map) : fclose(map); + exit(EXIT_SUCCESS); +} diff --git a/sys-utils/renice.1 b/sys-utils/renice.1 new file mode 100644 index 0000000..6b735fa --- /dev/null +++ b/sys-utils/renice.1 @@ -0,0 +1,119 @@ +.\" Copyright (c) 1983, 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)renice.8 8.1 (Berkeley) 6/9/93 +.\" +.TH RENICE "1" "July 2014" "util-linux" "User Commands" +.SH NAME +renice \- alter priority of running processes +.SH SYNOPSIS +.B renice +.RB [ \-n ] +.I priority +.RB [ \-g | \-p | \-u ] +.IR identifier ... +.SH DESCRIPTION +.B renice +alters the scheduling priority of one or more running processes. The +first argument is the \fIpriority\fR value to be used. +The other arguments are interpreted as process IDs (by default), +process group IDs, user IDs, or user names. +.BR renice 'ing +a process group causes all processes in the process group to have their +scheduling priority altered. +.BR renice 'ing +a user causes all processes owned by the user to have their scheduling +priority altered. +.PP +.SH OPTIONS +.TP +.BR \-n , " \-\-priority " \fIpriority\fR +Specify the scheduling +.I priority +to be used for the process, process group, or user. Use of the option +.BR \-n " or " \-\-priority +is optional, but when used it must be the first argument. +.TP +.BR \-g , " \-\-pgrp +Interpret the succeeding arguments as process group IDs. +.TP +.BR \-p , " \-\-pid +Interpret the succeeding arguments as process IDs +(the default). +.TP +.BR \-u , " \-\-user +Interpret the succeeding arguments as usernames or UIDs. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH EXAMPLES +The following command would change the priority of the processes with +PIDs 987 and 32, plus all processes owned by the users daemon and root: +.TP +.B " renice" +1 987 -u daemon root -p 32 +.SH NOTES +Users other than the superuser may only alter the priority of processes they +own. Furthermore, an unprivileged user can only +.I increase +the ``nice value'' (i.e., choose a lower priority) +and such changes are irreversible unless (since Linux 2.6.12) +the user has a suitable ``nice'' resource limit (see +.BR ulimit (1) +and +.BR getrlimit (2)). + +The superuser may alter the priority of any process and set the priority to any +value in the range \-20 to 19. +Useful priorities are: 19 (the affected processes will run only when nothing +else in the system wants to), 0 (the ``base'' scheduling priority), anything +negative (to make things go very fast). +.SH FILES +.TP +.I /etc/passwd +to map user names to user IDs +.SH SEE ALSO +.BR nice (1), +.BR getpriority (2), +.BR setpriority (2), +.BR credentials (7), +.BR sched (7) +.SH HISTORY +The +.B renice +command appeared in 4.0BSD. +.SH AVAILABILITY +The renice command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/renice.c b/sys-utils/renice.c new file mode 100644 index 0000000..3ae71f9 --- /dev/null +++ b/sys-utils/renice.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 1983, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + /* 1999-02-22 Arkadiusz MiÅ›kiewicz <misiek@pld.ORG.PL> + * - added Native Language Support + */ + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +#include <stdio.h> +#include <pwd.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include "nls.h" +#include "c.h" +#include "closestream.h" + +static const char *idtype[] = { + [PRIO_PROCESS] = N_("process ID"), + [PRIO_PGRP] = N_("process group ID"), + [PRIO_USER] = N_("user ID"), +}; + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %1$s [-n] <priority> [-p|--pid] <pid>...\n" + " %1$s [-n] <priority> -g|--pgrp <pgid>...\n" + " %1$s [-n] <priority> -u|--user <user>...\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Alter the priority of running processes.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -n, --priority <num> specify the nice increment value\n"), out); + fputs(_(" -p, --pid <id> interpret argument as process ID (default)\n"), out); + fputs(_(" -g, --pgrp <id> interpret argument as process group ID\n"), out); + fputs(_(" -u, --user <name>|<id> interpret argument as username or user ID\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(24)); + printf(USAGE_MAN_TAIL("renice(1)")); + exit(EXIT_SUCCESS); +} + +static int getprio(const int which, const int who, int *prio) +{ + errno = 0; + *prio = getpriority(which, who); + if (*prio == -1 && errno) { + warn(_("failed to get priority for %d (%s)"), who, idtype[which]); + return -errno; + } + return 0; +} + +static int donice(const int which, const int who, const int prio) +{ + int oldprio, newprio; + + if (getprio(which, who, &oldprio) != 0) + return 1; + if (setpriority(which, who, prio) < 0) { + warn(_("failed to set priority for %d (%s)"), who, idtype[which]); + return 1; + } + if (getprio(which, who, &newprio) != 0) + return 1; + printf(_("%d (%s) old priority %d, new priority %d\n"), + who, idtype[which], oldprio, newprio); + return 0; +} + +/* + * Change the priority (the nice value) of processes + * or groups of processes which are already running. + */ +int main(int argc, char **argv) +{ + int which = PRIO_PROCESS; + int who = 0, prio, errs = 0; + char *endptr = NULL; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + argc--; + argv++; + + if (argc == 1) { + if (strcmp(*argv, "-h") == 0 || + strcmp(*argv, "--help") == 0) + usage(); + + if (strcmp(*argv, "-v") == 0 || + strcmp(*argv, "-V") == 0 || + strcmp(*argv, "--version") == 0) { + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + } + } + + if (*argv && (strcmp(*argv, "-n") == 0 || strcmp(*argv, "--priority") == 0)) { + argc--; + argv++; + } + + if (argc < 2) { + warnx(_("not enough arguments")); + errtryhelp(EXIT_FAILURE); + } + + prio = strtol(*argv, &endptr, 10); + if (*endptr) { + warnx(_("invalid priority '%s'"), *argv); + errtryhelp(EXIT_FAILURE); + } + argc--; + argv++; + + for (; argc > 0; argc--, argv++) { + if (strcmp(*argv, "-g") == 0 || strcmp(*argv, "--pgrp") == 0) { + which = PRIO_PGRP; + continue; + } + if (strcmp(*argv, "-u") == 0 || strcmp(*argv, "--user") == 0) { + which = PRIO_USER; + continue; + } + if (strcmp(*argv, "-p") == 0 || strcmp(*argv, "--pid") == 0) { + which = PRIO_PROCESS; + continue; + } + if (which == PRIO_USER) { + struct passwd *pwd = getpwnam(*argv); + + if (pwd != NULL) + who = pwd->pw_uid; + else + who = strtol(*argv, &endptr, 10); + if (who < 0 || *endptr) { + warnx(_("unknown user %s"), *argv); + errs = 1; + continue; + } + } else { + who = strtol(*argv, &endptr, 10); + if (who < 0 || *endptr) { + /* TRANSLATORS: The first %s is one of the above + * three ID names. Read: "bad value for %s: %s" */ + warnx(_("bad %s value: %s"), idtype[which], *argv); + errs = 1; + continue; + } + } + errs |= donice(which, who, prio); + } + return errs != 0 ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/sys-utils/rfkill.8 b/sys-utils/rfkill.8 new file mode 100644 index 0000000..9eff913 --- /dev/null +++ b/sys-utils/rfkill.8 @@ -0,0 +1,120 @@ +.\" -*- nroff -*- +.TH RFKILL "8" "2017-07-06" "util-linux" "System Administration" +.SH NAME +rfkill \- tool for enabling and disabling wireless devices +.SH SYNOPSIS +.B rfkill +.RI [ options ] +.RI [ command ] +.RI [ id|type \ ...] + +.SH DESCRIPTION +.B rfkill +lists, enabling and disabling wireless devices. + +The command "list" output format is deprecated and maintained for backward +compatibility only. The new output format is the default when no command is +specified or when the option \fB\-\-output\fR is used. + +The default output is subject to change. So whenever possible, you should +avoid using default outputs in your scripts. Always explicitly define expected +columns by using the \fB\-\-output\fR option together with a columns list in +environments where a stable output is required. + + +.SH OPTIONS +.TP +\fB\-J\fR, \fB\-\-json\fR +Use JSON output format. +.TP +\fB\-n\fR, \fB\-\-noheadings\fR +Do not print a header line. +.TP +\fB\-o\fR, \fB\-\-output\fR +Specify which output columns to print. Use \-\-help to get a list of +available columns. +.TP +.B \-\-output\-all +Output all available columns. +.TP +\fB\-r\fR, \fB\-\-raw\fR +Use the raw output format. +.TP +.B \-\-help +Display help text and exit. +.TP +.B \-\-version +Display version information and exit. +.SH COMMANDS +.TP +.B help +Display help text and exit. +.TP +.B event +Listen for rfkill events and display them on stdout. +.TP +\fBlist \fR[\fIid\fR|\fItype\fR ...] +List the current state of all available devices. The command output format is deprecated, see the section DESCRIPTION. +It is a good idea to check with +.B list +command +.IR id " or " type +scope is appropriate before setting +.BR block " or " unblock . +Special +.I all +type string will match everything. Use of multiple +.IR id " or " type +arguments is supported. +.TP +\fBblock \fBid\fR|\fBtype\fR [...] +Disable the corresponding device. +.TP +\fBunblock \fBid\fR|\fBtype\fR [...] +Enable the corresponding device. If the device is hard\-blocked, for example +via a hardware switch, it will remain unavailable though it is now +soft\-unblocked. +.SH EXAMPLES +rfkill --output ID,TYPE +.br +rfkill block all +.br +rfkill unblock wlan +.br +rfkill block bluetooth uwb wimax wwan gps fm nfc +.SH AUTHORS +.B rfkill +was originally written by +.MT johannes@\:sipsolutions.\:net +Johannes Berg +.ME +and +.MT marcel@\:holtmann.\:org +Marcel Holtmann +.ME . +The code has been later modified by +.MT kerolasa@\:iki.\:fi +Sami Kerola +.ME +and +.MT kzak@\:redhat.\:com +Karel Zak +.ME +for util-linux project. +.PP +This manual page was written by +.MT linux@\:youmustbejoking.\:demon.\:co.uk +Darren Salt +.ME , +for the Debian project (and may be used by others). +.SH "SEE ALSO" +.BR powertop (8), +.BR systemd-rfkill (8), +.UR https://\:git.\:kernel.\:org/\:pub/\:scm/\:linux/\:kernel/\:git/\:torvalds/\:linux.git/\:tree/\:Documentation/\:rfkill.txt +Linux kernel documentation +.UE +.SH AVAILABILITY +The rfkill command is part of the util\-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util\-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/rfkill.c b/sys-utils/rfkill.c new file mode 100644 index 0000000..a93e8ba --- /dev/null +++ b/sys-utils/rfkill.c @@ -0,0 +1,751 @@ +/* + * /dev/rfkill userspace tool + * + * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2009 Marcel Holtmann <marcel@holtmann.org> + * Copyright 2009 Tim Gardner <tim.gardner@canonical.com> + * Copyright 2017 Sami Kerola <kerolasa@iki.fi> + * Copyright (C) 2017 Karel Zak <kzak@redhat.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <ctype.h> +#include <getopt.h> +#include <libsmartcols.h> +#include <linux/rfkill.h> +#include <poll.h> +#include <sys/syslog.h> +#include <sys/time.h> + +#include "c.h" +#include "closestream.h" +#include "nls.h" +#include "optutils.h" +#include "pathnames.h" +#include "strutils.h" +#include "timeutils.h" +#include "widechar.h" +#include "xalloc.h" + + +/* + * NFC supported by kernel since v3.10 (year 2013); FM and another types are from + * year 2009 (2.6.33) or older. + */ +#ifndef RFKILL_TYPE_NFC +# ifndef RFKILL_TYPE_FM +# define RFKILL_TYPE_FM RFKILL_TYPE_GPS + 1 +# endif +# define RFKILL_TYPE_NFC RFKILL_TYPE_FM + 1 +# undef NUM_RFKILL_TYPES +# define NUM_RFKILL_TYPES RFKILL_TYPE_NFC + 1 +#endif + +struct rfkill_type_str { + enum rfkill_type type; /* ID */ + const char *name; /* generic name */ + const char *desc; /* human readable name */ +}; + +static const struct rfkill_type_str rfkill_type_strings[] = { + { .type = RFKILL_TYPE_ALL, .name = "all" }, + { .type = RFKILL_TYPE_WLAN, .name = "wlan", .desc = "Wireless LAN" }, + { .type = RFKILL_TYPE_WLAN, .name = "wifi" }, /* alias */ + { .type = RFKILL_TYPE_BLUETOOTH, .name = "bluetooth", .desc = "Bluetooth" }, + { .type = RFKILL_TYPE_UWB, .name = "uwb", .desc = "Ultra-Wideband" }, + { .type = RFKILL_TYPE_UWB, .name = "ultrawideband" }, /* alias */ + { .type = RFKILL_TYPE_WIMAX, .name = "wimax", .desc = "WiMAX" }, + { .type = RFKILL_TYPE_WWAN, .name = "wwan", .desc = "Wireless WAN" }, + { .type = RFKILL_TYPE_GPS, .name = "gps", .desc = "GPS" }, + { .type = RFKILL_TYPE_FM, .name = "fm", .desc = "FM" }, + { .type = RFKILL_TYPE_NFC, .name = "nfc", .desc = "NFC" }, + { .type = NUM_RFKILL_TYPES, .name = NULL } +}; + +struct rfkill_id { + union { + enum rfkill_type type; + uint32_t index; + }; + enum { + RFKILL_IS_INVALID, + RFKILL_IS_TYPE, + RFKILL_IS_INDEX, + RFKILL_IS_ALL + } result; +}; + +/* supported actions */ +enum { + ACT_LIST, + ACT_HELP, + ACT_EVENT, + ACT_BLOCK, + ACT_UNBLOCK, + + ACT_LIST_OLD +}; + +static char *rfkill_actions[] = { + [ACT_LIST] = "list", + [ACT_HELP] = "help", + [ACT_EVENT] = "event", + [ACT_BLOCK] = "block", + [ACT_UNBLOCK] = "unblock" +}; + +/* column IDs */ +enum { + COL_DEVICE, + COL_ID, + COL_TYPE, + COL_DESC, + COL_SOFT, + COL_HARD +}; + +/* column names */ +struct colinfo { + const char *name; /* header */ + double whint; /* width hint (N < 1 is in percent of termwidth) */ + int flags; /* SCOLS_FL_* */ + const char *help; +}; + +/* columns descriptions */ +static const struct colinfo infos[] = { + [COL_DEVICE] = {"DEVICE", 0, 0, N_("kernel device name")}, + [COL_ID] = {"ID", 2, SCOLS_FL_RIGHT, N_("device identifier value")}, + [COL_TYPE] = {"TYPE", 0, 0, N_("device type name that can be used as identifier")}, + [COL_DESC] = {"TYPE-DESC", 0, 0, N_("device type description")}, + [COL_SOFT] = {"SOFT", 0, SCOLS_FL_RIGHT, N_("status of software block")}, + [COL_HARD] = {"HARD", 0, SCOLS_FL_RIGHT, N_("status of hardware block")} +}; + +static int columns[ARRAY_SIZE(infos) * 2]; +static size_t ncolumns; + +struct control { + struct libscols_table *tb; + unsigned int + json:1, + no_headings:1, + raw:1; +}; + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + assert(name); + + for (i = 0; i < ARRAY_SIZE(infos); i++) { + const char *cn = infos[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static int get_column_id(size_t num) +{ + assert(num < ncolumns); + assert(columns[num] < (int)ARRAY_SIZE(infos)); + return columns[num]; +} + +static const struct colinfo *get_column_info(int num) +{ + return &infos[get_column_id(num)]; +} + +static int string_to_action(const char *str) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(rfkill_actions); i++) + if (strcmp(str, rfkill_actions[i]) == 0) + return i; + + return -EINVAL; +} + +static int rfkill_ro_open(int nonblock) +{ + int fd; + + fd = open(_PATH_DEV_RFKILL, O_RDONLY); + if (fd < 0) { + warn(_("cannot open %s"), _PATH_DEV_RFKILL); + return -errno; + } + + if (nonblock && fcntl(fd, F_SETFL, O_NONBLOCK) < 0) { + warn(_("cannot set non-blocking %s"), _PATH_DEV_RFKILL); + close(fd); + return -errno; + } + + return fd; +} + +/* returns: 0 success, 1 read again, < 0 error */ +static int rfkill_read_event(int fd, struct rfkill_event *event) +{ + ssize_t len = read(fd, event, sizeof(*event)); + + if (len < 0) { + if (errno == EAGAIN) + return 1; + warn(_("cannot read %s"), _PATH_DEV_RFKILL); + return -errno; + } + + if (len < RFKILL_EVENT_SIZE_V1) { + warnx(_("wrong size of rfkill event: %zu < %d"), len, RFKILL_EVENT_SIZE_V1); + return 1; + } + + return 0; +} + + +static int rfkill_event(void) +{ + struct rfkill_event event; + struct timeval tv; + char date_buf[ISO_BUFSIZ]; + struct pollfd p; + int fd, n; + + fd = rfkill_ro_open(0); + if (fd < 0) + return -errno; + + memset(&p, 0, sizeof(p)); + p.fd = fd; + p.events = POLLIN | POLLHUP; + + /* interrupted by signal only */ + while (1) { + int rc = 1; /* recover-able error */ + + n = poll(&p, 1, -1); + if (n < 0) { + warn(_("failed to poll %s"), _PATH_DEV_RFKILL); + goto failed; + } + + if (n) + rc = rfkill_read_event(fd, &event); + if (rc < 0) + goto failed; + if (rc) + continue; + + gettimeofday(&tv, NULL); + strtimeval_iso(&tv, ISO_TIMESTAMP_COMMA, date_buf, + sizeof(date_buf)); + printf("%s: idx %u type %u op %u soft %u hard %u\n", + date_buf, + event.idx, event.type, event.op, event.soft, event.hard); + fflush(stdout); + } + +failed: + close(fd); + return -1; +} + +static const char *get_sys_attr(uint32_t idx, const char *attr) +{ + static char name[128]; + char path[PATH_MAX]; + FILE *f; + char *p; + + snprintf(path, sizeof(path), _PATH_SYS_RFKILL "/rfkill%u/%s", idx, attr); + f = fopen(path, "r"); + if (!f) + goto done; + if (!fgets(name, sizeof(name), f)) + goto done; + p = strchr(name, '\n'); + if (p) + *p = '\0'; +done: + if (f) + fclose(f); + return name; +} + +static struct rfkill_id rfkill_id_to_type(const char *s) +{ + const struct rfkill_type_str *p; + struct rfkill_id ret; + + if (islower(*s)) { + for (p = rfkill_type_strings; p->name != NULL; p++) { + if (!strcmp(s, p->name)) { + ret.type = p->type; + if (!strcmp(s, "all")) + ret.result = RFKILL_IS_ALL; + else + ret.result = RFKILL_IS_TYPE; + return ret; + } + } + } else if (isdigit(*s)) { + /* assume a numeric character implies an index. */ + char filename[64]; + + ret.index = strtou32_or_err(s, _("invalid identifier")); + snprintf(filename, sizeof(filename) - 1, + _PATH_SYS_RFKILL "/rfkill%" PRIu32 "/name", ret.index); + if (access(filename, F_OK) == 0) + ret.result = RFKILL_IS_INDEX; + else + ret.result = RFKILL_IS_INVALID; + return ret; + } + + ret.result = RFKILL_IS_INVALID; + return ret; +} + +static const char *rfkill_type_to_desc(enum rfkill_type type) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(rfkill_type_strings); i++) { + if (type == rfkill_type_strings[i].type) + return rfkill_type_strings[i].desc; + } + + return NULL; +} + + +static int event_match(struct rfkill_event *event, struct rfkill_id *id) +{ + if (event->op != RFKILL_OP_ADD) + return 0; + + /* filter out unwanted results */ + switch (id->result) { + case RFKILL_IS_TYPE: + if (event->type != id->type) + return 0; + break; + case RFKILL_IS_INDEX: + if (event->idx != id->index) + return 0; + break; + case RFKILL_IS_ALL: + break; + default: + abort(); + } + + return 1; +} + +static void fill_table_row(struct libscols_table *tb, struct rfkill_event *event) +{ + static struct libscols_line *ln; + size_t i; + + assert(tb); + + ln = scols_table_new_line(tb, NULL); + if (!ln) { + errno = ENOMEM; + errx(EXIT_FAILURE, _("failed to allocate output line")); + } + + for (i = 0; i < (size_t)ncolumns; i++) { + char *str = NULL; + switch (get_column_id(i)) { + case COL_DEVICE: + str = xstrdup(get_sys_attr(event->idx, "name")); + break; + case COL_ID: + xasprintf(&str, "%" PRIu32, event->idx); + break; + case COL_TYPE: + str = xstrdup(get_sys_attr(event->idx, "type")); + break; + case COL_DESC: + str = xstrdup(rfkill_type_to_desc(event->type)); + break; + case COL_SOFT: + str = xstrdup(event->soft ? _("blocked") : _("unblocked")); + break; + case COL_HARD: + str = xstrdup(event->hard ? _("blocked") : _("unblocked")); + break; + default: + abort(); + } + if (str && scols_line_refer_data(ln, i, str)) + errx(EXIT_FAILURE, _("failed to add output data")); + } +} + +static int rfkill_list_old(const char *param) +{ + struct rfkill_id id = { .result = RFKILL_IS_ALL }; + struct rfkill_event event; + int fd, rc = 0; + + if (param) { + id = rfkill_id_to_type(param); + if (id.result == RFKILL_IS_INVALID) { + warnx(_("invalid identifier: %s"), param); + return -EINVAL; + } + } + + fd = rfkill_ro_open(1); + + while (1) { + rc = rfkill_read_event(fd, &event); + if (rc < 0) + break; + if (rc == 1 && errno == EAGAIN) { + rc = 0; /* done */ + break; + } + if (rc == 0 && event_match(&event, &id)) { + char *name = xstrdup(get_sys_attr(event.idx, "name")), + *type = xstrdup(rfkill_type_to_desc(event.type)); + + if (!type) + type = xstrdup(get_sys_attr(event.idx, "type")); + + printf("%u: %s: %s\n", event.idx, name, type); + printf("\tSoft blocked: %s\n", event.soft ? "yes" : "no"); + printf("\tHard blocked: %s\n", event.hard ? "yes" : "no"); + + free(name); + free(type); + } + } + close(fd); + return rc; +} + +static void rfkill_list_init(struct control *ctrl) +{ + size_t i; + + scols_init_debug(0); + + ctrl->tb = scols_new_table(); + if (!ctrl->tb) + err(EXIT_FAILURE, _("failed to allocate output table")); + + scols_table_enable_json(ctrl->tb, ctrl->json); + scols_table_enable_noheadings(ctrl->tb, ctrl->no_headings); + scols_table_enable_raw(ctrl->tb, ctrl->raw); + + for (i = 0; i < (size_t) ncolumns; i++) { + const struct colinfo *col = get_column_info(i); + struct libscols_column *cl; + + cl = scols_table_new_column(ctrl->tb, col->name, col->whint, col->flags); + if (!cl) + err(EXIT_FAILURE, _("failed to allocate output column")); + if (ctrl->json) { + int id = get_column_id(i); + if (id == COL_ID) + scols_column_set_json_type(cl, SCOLS_JSON_NUMBER); + } + } +} + +static int rfkill_list_fill(struct control const *ctrl, const char *param) +{ + struct rfkill_id id = { .result = RFKILL_IS_ALL }; + struct rfkill_event event; + int fd, rc = 0; + + if (param) { + id = rfkill_id_to_type(param); + if (id.result == RFKILL_IS_INVALID) { + warnx(_("invalid identifier: %s"), param); + return -EINVAL; + } + } + + fd = rfkill_ro_open(1); + + while (1) { + rc = rfkill_read_event(fd, &event); + if (rc < 0) + break; + if (rc == 1 && errno == EAGAIN) { + rc = 0; /* done */ + break; + } + if (rc == 0 && event_match(&event, &id)) + fill_table_row(ctrl->tb, &event); + } + close(fd); + return rc; +} + +static void rfkill_list_output(struct control const *ctrl) +{ + scols_print_table(ctrl->tb); + scols_unref_table(ctrl->tb); +} + +static int rfkill_block(uint8_t block, const char *param) +{ + struct rfkill_id id; + struct rfkill_event event = { + .op = RFKILL_OP_CHANGE_ALL, + .soft = block, + 0 + }; + ssize_t len; + int fd; + char *message = NULL; + + id = rfkill_id_to_type(param); + + switch (id.result) { + case RFKILL_IS_INVALID: + warnx(_("invalid identifier: %s"), param); + return -1; + case RFKILL_IS_TYPE: + event.type = id.type; + xasprintf(&message, "type %s", param); + break; + case RFKILL_IS_INDEX: + event.op = RFKILL_OP_CHANGE; + event.idx = id.index; + xasprintf(&message, "id %d", id.index); + break; + case RFKILL_IS_ALL: + message = xstrdup("all"); + break; + default: + abort(); + } + + fd = open(_PATH_DEV_RFKILL, O_RDWR); + if (fd < 0) { + warn(_("cannot open %s"), _PATH_DEV_RFKILL); + free(message); + return -errno; + } + + len = write(fd, &event, sizeof(event)); + if (len < 0) + warn(_("write failed: %s"), _PATH_DEV_RFKILL); + else { + openlog("rfkill", 0, LOG_USER); + syslog(LOG_NOTICE, "%s set for %s", block ? "block" : "unblock", message); + closelog(); + } + free(message); + return close(fd); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + size_t i; + + fputs(USAGE_HEADER, stdout); + fprintf(stdout, _(" %s [options] command [identifier ...]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, stdout); + fputs(_("Tool for enabling and disabling wireless devices.\n"), stdout); + + fputs(USAGE_OPTIONS, stdout); + fputs(_(" -J, --json use JSON output format\n"), stdout); + fputs(_(" -n, --noheadings don't print headings\n"), stdout); + fputs(_(" -o, --output <list> define which output columns to use\n"), stdout); + fputs(_(" --output-all output all columns\n"), stdout); + fputs(_(" -r, --raw use the raw output format\n"), stdout); + + fputs(USAGE_SEPARATOR, stdout); + printf(USAGE_HELP_OPTIONS(24)); + + fputs(USAGE_COLUMNS, stdout); + for (i = 0; i < ARRAY_SIZE(infos); i++) + fprintf(stdout, " %-10s %s\n", infos[i].name, _(infos[i].help)); + + fputs(USAGE_COMMANDS, stdout); + + /* + * TRANSLATORS: command names should not be translated, explaining + * them as additional field after identifier is fine, for example + * + * list [identifier] (lista [tarkenne]) + */ + fputs(_(" help\n"), stdout); + fputs(_(" event\n"), stdout); + fputs(_(" list [identifier]\n"), stdout); + fputs(_(" block identifier\n"), stdout); + fputs(_(" unblock identifier\n"), stdout); + + fprintf(stdout, USAGE_MAN_TAIL("rfkill(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + struct control ctrl = { 0 }; + int c, act = ACT_LIST, list_all = 0; + char *outarg = NULL; + enum { + OPT_LIST_TYPES = CHAR_MAX + 1 + }; + static const struct option longopts[] = { + { "json", no_argument, NULL, 'J' }, + { "noheadings", no_argument, NULL, 'n' }, + { "output", required_argument, NULL, 'o' }, + { "output-all", no_argument, NULL, OPT_LIST_TYPES }, + { "raw", no_argument, NULL, 'r' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + { NULL, 0, NULL, 0 } + }; + static const ul_excl_t excl[] = { + {'J', 'r'}, + {0} + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + int ret = 0; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "Jno:rVh", longopts, NULL)) != -1) { + err_exclusive_options(c, longopts, excl, excl_st); + switch (c) { + case 'J': + ctrl.json = 1; + break; + case 'n': + ctrl.no_headings = 1; + break; + case 'o': + outarg = optarg; + break; + case OPT_LIST_TYPES: + list_all = 1; + break; + case 'r': + ctrl.raw = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + } + argc -= optind; + argv += optind; + + if (argc > 0) { + act = string_to_action(*argv); + if (act < 0) + errtryhelp(EXIT_FAILURE); + argv++; + argc--; + + /* + * For backward compatibility we use old output format if + * "list" explicitly specified and--output not defined. + */ + if (!outarg && act == ACT_LIST) + act = ACT_LIST_OLD; + } + + switch (act) { + case ACT_LIST_OLD: + /* Deprecated in favour of ACT_LIST */ + if (!argc) + ret |= rfkill_list_old(NULL); /* ALL */ + else while (argc) { + ret |= rfkill_list_old(*argv); + argc--; + argv++; + } + break; + + case ACT_LIST: + columns[ncolumns++] = COL_ID; + columns[ncolumns++] = COL_TYPE; + columns[ncolumns++] = COL_DEVICE; + if (list_all) + columns[ncolumns++] = COL_DESC; + columns[ncolumns++] = COL_SOFT; + columns[ncolumns++] = COL_HARD; + + if (outarg + && string_add_to_idarray(outarg, columns, + ARRAY_SIZE(columns), &ncolumns, + column_name_to_id) < 0) + return EXIT_FAILURE; + + rfkill_list_init(&ctrl); + if (!argc) + ret |= rfkill_list_fill(&ctrl, NULL); /* ALL */ + else while (argc) { + ret |= rfkill_list_fill(&ctrl, *argv); + argc--; + argv++; + } + rfkill_list_output(&ctrl); + break; + + case ACT_EVENT: + ret = rfkill_event(); + break; + + case ACT_HELP: + usage(); + break; + + case ACT_BLOCK: + while (argc) { + ret |= rfkill_block(1, *argv); + argc--; + argv++; + } + break; + + case ACT_UNBLOCK: + while (argc) { + ret |= rfkill_block(0, *argv); + argv++; + argc--; + } + break; + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/sys-utils/rtcwake.8 b/sys-utils/rtcwake.8 new file mode 100644 index 0000000..4a5f8d7 --- /dev/null +++ b/sys-utils/rtcwake.8 @@ -0,0 +1,189 @@ +.\" Copyright (c) 2007, SUSE LINUX Products GmbH +.\" Bernhard Walle <bwalle@suse.de> +.\" +.\" This program is free software; you can redistribute it and/or +.\" modify it under the terms of the GNU General Public License +.\" as published by the Free Software Foundation; either version 2 +.\" of the License, or (at your option) any later version. +.\" +.\" This program is distributed in the hope that it will be useful, +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +.\" GNU General Public License for more details. +.\" +.\" You should have received a copy of the GNU General Public License +.\" along with this program; if not, write to the Free Software +.\" Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +.\" 02110-1301, USA. +.\" +.TH RTCWAKE 8 "June 2015" "util-linux" "System Administration" +.SH NAME +rtcwake \- enter a system sleep state until specified wakeup time +.SH SYNOPSIS +.B rtcwake +[options] +.RB [ \-d +.IR device ] +.RB [ \-m +.IR standby_mode ] +.RB { "\-s \fIseconds\fP" | "\-t \fItime_t\fP" } +.SH DESCRIPTION +This program is used to enter a system sleep state and to automatically +wake from it at a specified time. +.PP +This uses cross-platform Linux interfaces to enter a system sleep state, and +leave it no later than a specified time. It uses any RTC framework driver that +supports standard driver model wakeup flags. +.PP +This is normally used like the old \fBapmsleep\fP utility, to wake from a suspend +state like ACPI S1 (standby) or S3 (suspend-to-RAM). Most platforms can +implement those without analogues of BIOS, APM, or ACPI. +.PP +On some systems, this can also be used like \fBnvram-wakeup\fP, waking from states +like ACPI S4 (suspend to disk). Not all systems have persistent media that are +appropriate for such suspend modes. +.PP +Note that alarm functionality depends on hardware; not every RTC is able to setup +an alarm up to 24 hours in the future. +.PP +The suspend setup maybe be interrupted by active hardware; for example wireless USB +input devices that continue to send events for some fraction of a second after the +return key is pressed. +.B rtcwake +tries to avoid this problem and it waits to terminal to settle down before +entering a system sleep. + +.SH OPTIONS +.TP +.BR \-A , " \-\-adjfile " \fIfile +Specify an alternative path to the adjust file. +.TP +.BR \-a , " \-\-auto" +Read the clock mode (whether the hardware clock is set to UTC or local time) +from the \fIadjtime\fP file, where +.BR hwclock (8) +stores that information. This is the default. +.TP +.BR \-\-date " \fItimestamp" +Set the wakeup time to the value of the timestamp. Format of the +timestamp can be any of the following: +.TS +tab(|); +l2 l. +YYYYMMDDhhmmss +YYYY-MM-DD hh:mm:ss +YYYY-MM-DD hh:mm|(seconds will be set to 00) +YYYY-MM-DD|(time will be set to 00:00:00) +hh:mm:ss|(date will be set to today) +hh:mm|(date will be set to today, seconds to 00) +tomorrow|(time is set to 00:00:00) ++5min +.TE +.TP +.BR \-d , " \-\-device " \fIdevice +Use the specified \fIdevice\fP instead of \fBrtc0\fP as realtime clock. +This option is only relevant if your system has more than one RTC. +You may specify \fBrtc1\fP, \fBrtc2\fP, ... here. +.TP +.BR \-l , " \-\-local" +Assume that the hardware clock is set to local time, regardless of the +contents of the \fIadjtime\fP file. +.TP +.B \-\-list\-modes +List available \-\-mode option arguments. +.TP +.BR \-m , " \-\-mode " \fImode +Go into the given standby state. Valid values for \fImode\fP are: +.RS +.TP +.B standby +ACPI state S1. This state offers minimal, though real, power savings, while +providing a very low-latency transition back to a working system. This is the +default mode. +.TP +.B freeze +The processes are frozen, all the devices are suspended and all the processors +idled. This state is a general state that does not need any platform-specific +support, but it saves less power than Suspend-to-RAM, because the system is +still in a running state. (Available since Linux 3.9.) +.TP +.B mem +ACPI state S3 (Suspend-to-RAM). This state offers significant power savings as +everything in the system is put into a low-power state, except for memory, +which is placed in self-refresh mode to retain its contents. +.TP +.B disk +ACPI state S4 (Suspend-to-disk). This state offers the greatest power savings, +and can be used even in the absence of low-level platform support for power +management. This state operates similarly to Suspend-to-RAM, but includes a +final step of writing memory contents to disk. +.TP +.B off +ACPI state S5 (Poweroff). This is done by calling '/sbin/shutdown'. +Not officially supported by ACPI, but it usually works. +.TP +.B no +Don't suspend, only set the RTC wakeup time. +.TP +.B on +Don't suspend, but read the RTC device until an alarm time appears. +This mode is useful for debugging. +.TP +.B disable +Disable a previously set alarm. +.TP +.B show +Print alarm information in format: "alarm: off|on <time>". +The time is in ctime() output format, e.g. "alarm: on Tue Nov 16 04:48:45 2010". +.RE +.TP +.BR \-n , " \-\-dry-run" +This option does everything apart from actually setting up the alarm, +suspending the system, or waiting for the alarm. +.TP +.BR \-s , " \-\-seconds " \fIseconds +Set the wakeup time to \fIseconds\fP in the future from now. +.TP +.BR \-t , " \-\-time " \fItime_t +Set the wakeup time to the absolute time \fItime_t\fP. \fItime_t\fP +is the time in seconds since 1970-01-01, 00:00 UTC. Use the +.BR date (1) +tool to convert between human-readable time and \fItime_t\fP. +.TP +.BR \-u , " \-\-utc" +Assume that the hardware clock is set to UTC (Universal Time Coordinated), +regardless of the contents of the \fIadjtime\fP file. +.TP +.BR \-v , " \-\-verbose" +Be verbose. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH NOTES +Some PC systems can't currently exit sleep states such as \fBmem\fP +using only the kernel code accessed by this driver. +They need help from userspace code to make the framebuffer work again. +.SH FILES +.I /etc/adjtime +.SH HISTORY +The program was posted several times on LKML and other lists +before appearing in kernel commit message for Linux 2.6 in the GIT +commit 87ac84f42a7a580d0dd72ae31d6a5eb4bfe04c6d. +.SH AUTHORS +The program was written by David Brownell <dbrownell@users.sourceforge.net> +and improved by Bernhard Walle <bwalle@suse.de>. +.SH COPYRIGHT +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License <http://www.gnu.org/licenses/gpl.html>. +There is NO WARRANTY, to the extent permitted by law. +.SH "SEE ALSO" +.BR hwclock (8), +.BR date (1) +.SH AVAILABILITY +The rtcwake command is part of the util-linux package and is available from the +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/rtcwake.8.in b/sys-utils/rtcwake.8.in new file mode 100644 index 0000000..167f7f9 --- /dev/null +++ b/sys-utils/rtcwake.8.in @@ -0,0 +1,189 @@ +.\" Copyright (c) 2007, SUSE LINUX Products GmbH +.\" Bernhard Walle <bwalle@suse.de> +.\" +.\" This program is free software; you can redistribute it and/or +.\" modify it under the terms of the GNU General Public License +.\" as published by the Free Software Foundation; either version 2 +.\" of the License, or (at your option) any later version. +.\" +.\" This program is distributed in the hope that it will be useful, +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +.\" GNU General Public License for more details. +.\" +.\" You should have received a copy of the GNU General Public License +.\" along with this program; if not, write to the Free Software +.\" Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +.\" 02110-1301, USA. +.\" +.TH RTCWAKE 8 "June 2015" "util-linux" "System Administration" +.SH NAME +rtcwake \- enter a system sleep state until specified wakeup time +.SH SYNOPSIS +.B rtcwake +[options] +.RB [ \-d +.IR device ] +.RB [ \-m +.IR standby_mode ] +.RB { "\-s \fIseconds\fP" | "\-t \fItime_t\fP" } +.SH DESCRIPTION +This program is used to enter a system sleep state and to automatically +wake from it at a specified time. +.PP +This uses cross-platform Linux interfaces to enter a system sleep state, and +leave it no later than a specified time. It uses any RTC framework driver that +supports standard driver model wakeup flags. +.PP +This is normally used like the old \fBapmsleep\fP utility, to wake from a suspend +state like ACPI S1 (standby) or S3 (suspend-to-RAM). Most platforms can +implement those without analogues of BIOS, APM, or ACPI. +.PP +On some systems, this can also be used like \fBnvram-wakeup\fP, waking from states +like ACPI S4 (suspend to disk). Not all systems have persistent media that are +appropriate for such suspend modes. +.PP +Note that alarm functionality depends on hardware; not every RTC is able to setup +an alarm up to 24 hours in the future. +.PP +The suspend setup maybe be interrupted by active hardware; for example wireless USB +input devices that continue to send events for some fraction of a second after the +return key is pressed. +.B rtcwake +tries to avoid this problem and it waits to terminal to settle down before +entering a system sleep. + +.SH OPTIONS +.TP +.BR \-A , " \-\-adjfile " \fIfile +Specify an alternative path to the adjust file. +.TP +.BR \-a , " \-\-auto" +Read the clock mode (whether the hardware clock is set to UTC or local time) +from the \fIadjtime\fP file, where +.BR hwclock (8) +stores that information. This is the default. +.TP +.BR \-\-date " \fItimestamp" +Set the wakeup time to the value of the timestamp. Format of the +timestamp can be any of the following: +.TS +tab(|); +l2 l. +YYYYMMDDhhmmss +YYYY-MM-DD hh:mm:ss +YYYY-MM-DD hh:mm|(seconds will be set to 00) +YYYY-MM-DD|(time will be set to 00:00:00) +hh:mm:ss|(date will be set to today) +hh:mm|(date will be set to today, seconds to 00) +tomorrow|(time is set to 00:00:00) ++5min +.TE +.TP +.BR \-d , " \-\-device " \fIdevice +Use the specified \fIdevice\fP instead of \fBrtc0\fP as realtime clock. +This option is only relevant if your system has more than one RTC. +You may specify \fBrtc1\fP, \fBrtc2\fP, ... here. +.TP +.BR \-l , " \-\-local" +Assume that the hardware clock is set to local time, regardless of the +contents of the \fIadjtime\fP file. +.TP +.B \-\-list\-modes +List available \-\-mode option arguments. +.TP +.BR \-m , " \-\-mode " \fImode +Go into the given standby state. Valid values for \fImode\fP are: +.RS +.TP +.B standby +ACPI state S1. This state offers minimal, though real, power savings, while +providing a very low-latency transition back to a working system. This is the +default mode. +.TP +.B freeze +The processes are frozen, all the devices are suspended and all the processors +idled. This state is a general state that does not need any platform-specific +support, but it saves less power than Suspend-to-RAM, because the system is +still in a running state. (Available since Linux 3.9.) +.TP +.B mem +ACPI state S3 (Suspend-to-RAM). This state offers significant power savings as +everything in the system is put into a low-power state, except for memory, +which is placed in self-refresh mode to retain its contents. +.TP +.B disk +ACPI state S4 (Suspend-to-disk). This state offers the greatest power savings, +and can be used even in the absence of low-level platform support for power +management. This state operates similarly to Suspend-to-RAM, but includes a +final step of writing memory contents to disk. +.TP +.B off +ACPI state S5 (Poweroff). This is done by calling '/sbin/shutdown'. +Not officially supported by ACPI, but it usually works. +.TP +.B no +Don't suspend, only set the RTC wakeup time. +.TP +.B on +Don't suspend, but read the RTC device until an alarm time appears. +This mode is useful for debugging. +.TP +.B disable +Disable a previously set alarm. +.TP +.B show +Print alarm information in format: "alarm: off|on <time>". +The time is in ctime() output format, e.g. "alarm: on Tue Nov 16 04:48:45 2010". +.RE +.TP +.BR \-n , " \-\-dry-run" +This option does everything apart from actually setting up the alarm, +suspending the system, or waiting for the alarm. +.TP +.BR \-s , " \-\-seconds " \fIseconds +Set the wakeup time to \fIseconds\fP in the future from now. +.TP +.BR \-t , " \-\-time " \fItime_t +Set the wakeup time to the absolute time \fItime_t\fP. \fItime_t\fP +is the time in seconds since 1970-01-01, 00:00 UTC. Use the +.BR date (1) +tool to convert between human-readable time and \fItime_t\fP. +.TP +.BR \-u , " \-\-utc" +Assume that the hardware clock is set to UTC (Universal Time Coordinated), +regardless of the contents of the \fIadjtime\fP file. +.TP +.BR \-v , " \-\-verbose" +Be verbose. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH NOTES +Some PC systems can't currently exit sleep states such as \fBmem\fP +using only the kernel code accessed by this driver. +They need help from userspace code to make the framebuffer work again. +.SH FILES +.I @ADJTIME_PATH@ +.SH HISTORY +The program was posted several times on LKML and other lists +before appearing in kernel commit message for Linux 2.6 in the GIT +commit 87ac84f42a7a580d0dd72ae31d6a5eb4bfe04c6d. +.SH AUTHORS +The program was written by David Brownell <dbrownell@users.sourceforge.net> +and improved by Bernhard Walle <bwalle@suse.de>. +.SH COPYRIGHT +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License <http://www.gnu.org/licenses/gpl.html>. +There is NO WARRANTY, to the extent permitted by law. +.SH "SEE ALSO" +.BR hwclock (8), +.BR date (1) +.SH AVAILABILITY +The rtcwake command is part of the util-linux package and is available from the +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/rtcwake.c b/sys-utils/rtcwake.c new file mode 100644 index 0000000..b63c646 --- /dev/null +++ b/sys-utils/rtcwake.c @@ -0,0 +1,655 @@ +/* + * rtcwake -- enter a system sleep state until specified wakeup time. + * + * This uses cross-platform Linux interfaces to enter a system sleep state, + * and leave it no later than a specified time. It uses any RTC framework + * driver that supports standard driver model wakeup flags. + * + * This is normally used like the old "apmsleep" utility, to wake from a + * suspend state like ACPI S1 (standby) or S3 (suspend-to-RAM). Most + * platforms can implement those without analogues of BIOS, APM, or ACPI. + * + * On some systems, this can also be used like "nvram-wakeup", waking + * from states like ACPI S4 (suspend to disk). Not all systems have + * persistent media that are appropriate for such suspend modes. + * + * The best way to set the system's RTC is so that it holds the current + * time in UTC. Use the "-l" flag to tell this program that the system + * RTC uses a local timezone instead (maybe you dual-boot MS-Windows). + * That flag should not be needed on systems with adjtime support. + */ + +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <linux/rtc.h> +#include <poll.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <sys/types.h> +#include <termios.h> +#include <time.h> +#include <unistd.h> + +#include "c.h" +#include "closestream.h" +#include "env.h" +#include "nls.h" +#include "optutils.h" +#include "pathnames.h" +#include "strutils.h" +#include "strv.h" +#include "timeutils.h" +#include "xalloc.h" + +#ifndef RTC_AF +# define RTC_AF 0x20 /* Alarm interrupt */ +#endif + +#define ADJTIME_ZONE_BUFSIZ 8 +#define SYS_WAKEUP_PATH_TEMPLATE "/sys/class/rtc/%s/device/power/wakeup" +#define SYS_POWER_STATE_PATH "/sys/power/state" +#define DEFAULT_RTC_DEVICE "/dev/rtc0" + +enum rtc_modes { /* manual page --mode option explains these. */ + OFF_MODE = 0, + NO_MODE, + ON_MODE, + DISABLE_MODE, + SHOW_MODE, + + SYSFS_MODE /* keep it last */ + +}; + +static const char *rtcwake_mode_string[] = { + [OFF_MODE] = "off", + [NO_MODE] = "no", + [ON_MODE] = "on", + [DISABLE_MODE] = "disable", + [SHOW_MODE] = "show" +}; + +enum clock_modes { + CM_AUTO, + CM_UTC, + CM_LOCAL +}; + +struct rtcwake_control { + char *mode_str; /* name of the requested mode */ + char **possible_modes; /* modes listed in /sys/power/state */ + char *adjfile; /* adjtime file path */ + enum clock_modes clock_mode; /* hwclock timezone */ + time_t sys_time; /* system time */ + time_t rtc_time; /* hardware time */ + unsigned int verbose:1, /* verbose messaging */ + dryrun:1; /* do not set alarm, suspend system, etc */ +}; + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %s [options]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Enter a system sleep state until a specified wakeup time.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --auto reads the clock mode from adjust file (default)\n"), out); + fprintf(out, + _(" -A, --adjfile <file> specifies the path to the adjust file\n" + " the default is %s\n"), _PATH_ADJTIME); + fputs(_(" --date <timestamp> date time of timestamp to wake\n"), out); + fputs(_(" -d, --device <device> select rtc device (rtc0|rtc1|...)\n"), out); + fputs(_(" -n, --dry-run does everything, but suspend\n"), out); + fputs(_(" -l, --local RTC uses local timezone\n"), out); + fputs(_(" --list-modes list available modes\n"), out); + fputs(_(" -m, --mode <mode> standby|mem|... sleep mode\n"), out); + fputs(_(" -s, --seconds <seconds> seconds to sleep\n"), out); + fputs(_(" -t, --time <time_t> time to wake\n"), out); + fputs(_(" -u, --utc RTC uses UTC\n"), out); + fputs(_(" -v, --verbose verbose messages\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(26)); + printf(USAGE_MAN_TAIL("rtcwake(8)")); + exit(EXIT_SUCCESS); +} + +static int is_wakeup_enabled(const char *devname) +{ + char buf[128], *s; + FILE *f; + size_t skip = 0; + + if (startswith(devname, "/dev/")) + skip = 5; + snprintf(buf, sizeof buf, SYS_WAKEUP_PATH_TEMPLATE, devname + skip); + f = fopen(buf, "r"); + if (!f) { + warn(_("cannot open %s"), buf); + return 0; + } + + s = fgets(buf, sizeof buf, f); + fclose(f); + if (!s) + return 0; + s = strchr(buf, '\n'); + if (!s) + return 0; + *s = 0; + /* wakeup events could be disabled or not supported */ + return strcmp(buf, "enabled") == 0; +} + +static int get_basetimes(struct rtcwake_control *ctl, int fd) +{ + struct tm tm = { 0 }; + struct rtc_time rtc; + + /* This process works in RTC time, except when working + * with the system clock (which always uses UTC). + */ + if (ctl->clock_mode == CM_UTC) + xsetenv("TZ", "UTC", 1); + tzset(); + /* Read rtc and system clocks "at the same time", or as + * precisely (+/- a second) as we can read them. + */ + if (ioctl(fd, RTC_RD_TIME, &rtc) < 0) { + warn(_("read rtc time failed")); + return -1; + } + + ctl->sys_time = time(NULL); + if (ctl->sys_time == (time_t)-1) { + warn(_("read system time failed")); + return -1; + } + /* Convert rtc_time to normal arithmetic-friendly form, + * updating tm.tm_wday as used by asctime(). + */ + tm.tm_sec = rtc.tm_sec; + tm.tm_min = rtc.tm_min; + tm.tm_hour = rtc.tm_hour; + tm.tm_mday = rtc.tm_mday; + tm.tm_mon = rtc.tm_mon; + tm.tm_year = rtc.tm_year; + tm.tm_isdst = -1; /* assume the system knows better than the RTC */ + + ctl->rtc_time = mktime(&tm); + if (ctl->rtc_time == (time_t)-1) { + warn(_("convert rtc time failed")); + return -1; + } + + if (ctl->verbose) { + /* Unless the system uses UTC, either delta or tzone + * reflects a seconds offset from UTC. The value can + * help sort out problems like bugs in your C library. */ + printf("\tdelta = %ld\n", ctl->sys_time - ctl->rtc_time); + printf("\ttzone = %ld\n", timezone); + printf("\ttzname = %s\n", tzname[daylight]); + gmtime_r(&ctl->rtc_time, &tm); + printf("\tsystime = %ld, (UTC) %s", + (long) ctl->sys_time, asctime(gmtime(&ctl->sys_time))); + printf("\trtctime = %ld, (UTC) %s", + (long) ctl->rtc_time, asctime(&tm)); + } + return 0; +} + +static int setup_alarm(struct rtcwake_control *ctl, int fd, time_t *wakeup) +{ + struct tm *tm; + struct rtc_wkalrm wake = { 0 }; + + /* The wakeup time is in POSIX time (more or less UTC). Ideally + * RTCs use that same time; but PCs can't do that if they need to + * boot MS-Windows. Messy... + * + * When clock_mode == CM_UTC this process's timezone is UTC, so + * we'll pass a UTC date to the RTC. + * + * Else clock_mode == CM_LOCAL so the time given to the RTC will + * instead use the local time zone. */ + tm = localtime(wakeup); + wake.time.tm_sec = tm->tm_sec; + wake.time.tm_min = tm->tm_min; + wake.time.tm_hour = tm->tm_hour; + wake.time.tm_mday = tm->tm_mday; + wake.time.tm_mon = tm->tm_mon; + wake.time.tm_year = tm->tm_year; + /* wday, yday, and isdst fields are unused */ + wake.time.tm_wday = -1; + wake.time.tm_yday = -1; + wake.time.tm_isdst = -1; + wake.enabled = 1; + + if (!ctl->dryrun && ioctl(fd, RTC_WKALM_SET, &wake) < 0) { + warn(_("set rtc wake alarm failed")); + return -1; + } + return 0; +} + +static char **get_sys_power_states(struct rtcwake_control *ctl) +{ + int fd = -1; + + if (!ctl->possible_modes) { + char buf[256] = { 0 }; + + fd = open(SYS_POWER_STATE_PATH, O_RDONLY); + if (fd < 0) + goto nothing; + if (read(fd, &buf, sizeof(buf) - 1) <= 0) + goto nothing; + ctl->possible_modes = strv_split(buf, " \n"); + close(fd); + } + return ctl->possible_modes; +nothing: + if (fd >= 0) + close(fd); + return NULL; +} + +static void wait_stdin(struct rtcwake_control *ctl) +{ + struct pollfd fd[] = { + {.fd = STDIN_FILENO, .events = POLLIN} + }; + int tries = 0; + + while (tries < 8 && poll(fd, 1, 10) == 1) { + if (ctl->verbose) + warnx(_("discarding stdin")); + xusleep(250000); + tcflush(STDIN_FILENO, TCIFLUSH); + tries++; + } +} + +static void suspend_system(struct rtcwake_control *ctl) +{ + FILE *f = fopen(SYS_POWER_STATE_PATH, "w"); + + if (!f) { + warn(_("cannot open %s"), SYS_POWER_STATE_PATH); + return; + } + + if (!ctl->dryrun) { + if (isatty(STDIN_FILENO)) + wait_stdin(ctl); + fprintf(f, "%s\n", ctl->mode_str); + fflush(f); + } + /* this executes after wake from suspend */ + if (close_stream(f)) + errx(EXIT_FAILURE, _("write error")); +} + +static int read_clock_mode(struct rtcwake_control *ctl) +{ + FILE *fp; + char linebuf[ADJTIME_ZONE_BUFSIZ]; + + fp = fopen(ctl->adjfile, "r"); + if (!fp) + return -1; + /* skip two lines */ + if (skip_fline(fp) || skip_fline(fp)) { + fclose(fp); + return -1; + } + /* read third line */ + if (!fgets(linebuf, sizeof linebuf, fp)) { + fclose(fp); + return -1; + } + + if (strncmp(linebuf, "UTC", 3) == 0) + ctl->clock_mode = CM_UTC; + else if (strncmp(linebuf, "LOCAL", 5) == 0) + ctl->clock_mode = CM_LOCAL; + else if (ctl->verbose) + warnx(_("unexpected third line in: %s: %s"), ctl->adjfile, linebuf); + + fclose(fp); + return 0; +} + +static int print_alarm(struct rtcwake_control *ctl, int fd) +{ + struct rtc_wkalrm wake; + struct tm tm = { 0 }; + time_t alarm; + + if (ioctl(fd, RTC_WKALM_RD, &wake) < 0) { + warn(_("read rtc alarm failed")); + return -1; + } + + if (wake.enabled != 1 || wake.time.tm_year == -1) { + printf(_("alarm: off\n")); + return 0; + } + tm.tm_sec = wake.time.tm_sec; + tm.tm_min = wake.time.tm_min; + tm.tm_hour = wake.time.tm_hour; + tm.tm_mday = wake.time.tm_mday; + tm.tm_mon = wake.time.tm_mon; + tm.tm_year = wake.time.tm_year; + tm.tm_isdst = -1; /* assume the system knows better than the RTC */ + + alarm = mktime(&tm); + if (alarm == (time_t)-1) { + warn(_("convert time failed")); + return -1; + } + /* 0 if both UTC, or expresses diff if RTC in local time */ + alarm += ctl->sys_time - ctl->rtc_time; + printf(_("alarm: on %s"), ctime(&alarm)); + + return 0; +} + +static int get_rtc_mode(struct rtcwake_control *ctl, const char *s) +{ + size_t i; + char **modes = get_sys_power_states(ctl), **m; + + STRV_FOREACH(m, modes) { + if (strcmp(s, *m) == 0) + return SYSFS_MODE; + } + + for (i = 0; i < ARRAY_SIZE(rtcwake_mode_string); i++) + if (!strcmp(s, rtcwake_mode_string[i])) + return i; + + return -EINVAL; +} + +static int open_dev_rtc(const char *devname) +{ + int fd; + char *devpath = NULL; + + if (startswith(devname, "/dev")) + devpath = xstrdup(devname); + else + xasprintf(&devpath, "/dev/%s", devname); + fd = open(devpath, O_RDONLY | O_CLOEXEC); + if (fd < 0) + err(EXIT_FAILURE, _("%s: unable to find device"), devpath); + free(devpath); + return fd; +} + +static void list_modes(struct rtcwake_control *ctl) +{ + size_t i; + char **modes = get_sys_power_states(ctl), **m; + + if (!modes) + errx(EXIT_FAILURE, _("could not read: %s"), SYS_POWER_STATE_PATH); + + STRV_FOREACH(m, modes) + printf("%s ", *m); + + for (i = 0; i < ARRAY_SIZE(rtcwake_mode_string); i++) + printf("%s ", rtcwake_mode_string[i]); + putchar('\n'); +} + +int main(int argc, char **argv) +{ + struct rtcwake_control ctl = { + .mode_str = "suspend", /* default mode */ + .adjfile = _PATH_ADJTIME, + .clock_mode = CM_AUTO + }; + char *devname = DEFAULT_RTC_DEVICE; + unsigned seconds = 0; + int suspend = SYSFS_MODE; + int rc = EXIT_SUCCESS; + int t; + int fd; + time_t alarm = 0; + enum { + OPT_DATE = CHAR_MAX + 1, + OPT_LIST + }; + static const struct option long_options[] = { + { "adjfile", required_argument, NULL, 'A' }, + { "auto", no_argument, NULL, 'a' }, + { "dry-run", no_argument, NULL, 'n' }, + { "local", no_argument, NULL, 'l' }, + { "utc", no_argument, NULL, 'u' }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + { "mode", required_argument, NULL, 'm' }, + { "device", required_argument, NULL, 'd' }, + { "seconds", required_argument, NULL, 's' }, + { "time", required_argument, NULL, 't' }, + { "date", required_argument, NULL, OPT_DATE }, + { "list-modes", no_argument, NULL, OPT_LIST }, + { NULL, 0, NULL, 0 } + }; + static const ul_excl_t excl[] = { + { 'a', 'l', 'u' }, + { 's', 't', OPT_DATE }, + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((t = getopt_long(argc, argv, "A:ahd:lm:ns:t:uVv", + long_options, NULL)) != EOF) { + err_exclusive_options(t, long_options, excl, excl_st); + switch (t) { + case 'A': + /* for better compatibility with hwclock */ + ctl.adjfile = optarg; + break; + case 'a': + ctl.clock_mode = CM_AUTO; + break; + case 'd': + devname = optarg; + break; + case 'l': + ctl.clock_mode = CM_LOCAL; + break; + + case OPT_LIST: + list_modes(&ctl); + return EXIT_SUCCESS; + + case 'm': + if ((suspend = get_rtc_mode(&ctl, optarg)) < 0) + errx(EXIT_FAILURE, _("unrecognized suspend state '%s'"), optarg); + ctl.mode_str = optarg; + break; + case 'n': + ctl.dryrun = 1; + break; + case 's': + /* alarm time, seconds-to-sleep (relative) */ + seconds = strtou32_or_err(optarg, _("invalid seconds argument")); + break; + case 't': + /* alarm time, time_t (absolute, seconds since epoch) */ + alarm = strtou32_or_err(optarg, _("invalid time argument")); + break; + case OPT_DATE: + { /* alarm time, see timestamp format from manual */ + usec_t p; + if (parse_timestamp(optarg, &p) < 0) + errx(EXIT_FAILURE, _("invalid time value \"%s\""), optarg); + alarm = (time_t) (p / 1000000); + break; + } + case 'u': + ctl.clock_mode = CM_UTC; + break; + case 'v': + ctl.verbose = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + exit(EXIT_SUCCESS); + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (ctl.clock_mode == CM_AUTO && read_clock_mode(&ctl) < 0) { + printf(_("%s: assuming RTC uses UTC ...\n"), program_invocation_short_name); + ctl.clock_mode = CM_UTC; + } + + if (ctl.verbose) + printf("%s", ctl.clock_mode == CM_UTC ? _("Using UTC time.\n") : + _("Using local time.\n")); + + if (!alarm && !seconds && suspend != DISABLE_MODE && suspend != SHOW_MODE) + errx(EXIT_FAILURE, _("must provide wake time (see --seconds, --time and --date options)")); + + /* device must exist and (if we'll sleep) be wakeup-enabled */ + fd = open_dev_rtc(devname); + + if (suspend != ON_MODE && suspend != NO_MODE && !is_wakeup_enabled(devname)) + errx(EXIT_FAILURE, _("%s not enabled for wakeup events"), devname); + + /* relative or absolute alarm time, normalized to time_t */ + if (get_basetimes(&ctl, fd) < 0) + exit(EXIT_FAILURE); + + if (ctl.verbose) + printf(_("alarm %ld, sys_time %ld, rtc_time %ld, seconds %u\n"), + alarm, ctl.sys_time, ctl.rtc_time, seconds); + + if (suspend != DISABLE_MODE && suspend != SHOW_MODE) { + /* perform alarm setup when the show or disable modes are not set */ + if (alarm) { + if (alarm < ctl.sys_time) + errx(EXIT_FAILURE, _("time doesn't go backward to %s"), + ctime(&alarm)); + alarm -= ctl.sys_time - ctl.rtc_time; + } else + alarm = ctl.rtc_time + seconds + 1; + + if (setup_alarm(&ctl, fd, &alarm) < 0) + exit(EXIT_FAILURE); + + if (suspend == NO_MODE || suspend == ON_MODE) + printf(_("%s: wakeup using %s at %s"), + program_invocation_short_name, devname, + ctime(&alarm)); + else + printf(_("%s: wakeup from \"%s\" using %s at %s"), + program_invocation_short_name, ctl.mode_str, devname, + ctime(&alarm)); + fflush(stdout); + xusleep(10 * 1000); + } + + switch (suspend) { + case NO_MODE: + if (ctl.verbose) + printf(_("suspend mode: no; leaving\n")); + ctl.dryrun = 1; /* to skip disabling alarm at the end */ + break; + case OFF_MODE: + { + char *arg[5]; + int i = 0; + + if (ctl.verbose) + printf(_("suspend mode: off; executing %s\n"), + _PATH_SHUTDOWN); + arg[i++] = _PATH_SHUTDOWN; + arg[i++] = "-h"; + arg[i++] = "-P"; + arg[i++] = "now"; + arg[i] = NULL; + if (!ctl.dryrun) { + execv(arg[0], arg); + warn(_("failed to execute %s"), _PATH_SHUTDOWN); + rc = EXIT_FAILURE; + } + break; + } + case ON_MODE: + { + unsigned long data; + + if (ctl.verbose) + printf(_("suspend mode: on; reading rtc\n")); + if (!ctl.dryrun) { + do { + t = read(fd, &data, sizeof data); + if (t < 0) { + warn(_("rtc read failed")); + break; + } + if (ctl.verbose) + printf("... %s: %03lx\n", devname, data); + } while (!(data & RTC_AF)); + } + break; + } + case DISABLE_MODE: + /* just break, alarm gets disabled in the end */ + if (ctl.verbose) + printf(_("suspend mode: disable; disabling alarm\n")); + break; + case SHOW_MODE: + if (ctl.verbose) + printf(_("suspend mode: show; printing alarm info\n")); + if (print_alarm(&ctl, fd)) + rc = EXIT_FAILURE; + ctl.dryrun = 1; /* don't really disable alarm in the end, just show */ + break; + default: + if (ctl.verbose) + printf(_("suspend mode: %s; suspending system\n"), ctl.mode_str); + sync(); + suspend_system(&ctl); + } + + if (!ctl.dryrun) { + struct rtc_wkalrm wake; + + if (ioctl(fd, RTC_WKALM_RD, &wake) < 0) { + warn(_("read rtc alarm failed")); + rc = EXIT_FAILURE; + } else { + wake.enabled = 0; + if (ioctl(fd, RTC_WKALM_SET, &wake) < 0) { + warn(_("disable rtc alarm interrupt failed")); + rc = EXIT_FAILURE; + } + } + } + + close(fd); + return rc; +} diff --git a/sys-utils/setarch.8 b/sys-utils/setarch.8 new file mode 100644 index 0000000..efa3d50 --- /dev/null +++ b/sys-utils/setarch.8 @@ -0,0 +1,143 @@ +.TH SETARCH 8 "December 2017" "util-linux" "System Administration" +.SH NAME +setarch \- change reported architecture in new program environment and/or set personality flags +.SH SYNOPSIS +.B setarch +.RI [ arch ] +[options] +.RI [ program +.RI [ argument ...]] +.sp +.B setarch +.BR \-\-list | \-h | \-V +.sp +.B arch +[options] +.RI [ program +.RI [ argument ...]] +.SH DESCRIPTION +.B setarch +modifies execution domains and process personality flags. +.PP +The execution domains currently only affects the output of \fBuname -m\fR. +For example, on an AMD64 system, running \fBsetarch i386 \fIprogram\fR +will cause \fIprogram\fR to see i686 instead of x86_64 as the machine type. +It also allows to set various personality options. +The default \fIprogram\fR is \fB/bin/sh\fR. +.PP +Since version 2.33 the +.I arch +command line argument is optional and +.B setarch +may be used to change personality flags (ADDR_LIMIT_*, SHORT_INODE, etc) without +modification of the execution domain. +.SH OPTIONS +.TP +.B \-\-list +List the architectures that \fBsetarch\fR knows about. Whether \fBsetarch\fR +can actually set each of these architectures depends on the running kernel. +.TP +.B \-\-uname\-2.6 +Causes the \fIprogram\fR to see a kernel version number beginning with 2.6. +Turns on UNAME26. +.TP +.BR \-v , " \-\-verbose" +Be verbose. +.TP +\fB\-3\fR, \fB\-\-3gb\fR +Specifies +.I program +should use a maximum of 3GB of address space. Supported on x86. Turns on +ADDR_LIMIT_3GB. +.TP +\fB\-\-4gb\fR +This option has no effect. It is retained for backward compatibility only, +and may be removed in future releases. +.TP +\fB\-B\fR, \fB\-\-32bit\fR +Limit the address space to 32 bits to emulate hardware. Supported on ARM +and Alpha. Turns on ADDR_LIMIT_32BIT. +.TP +\fB\-F\fR, \fB\-\-fdpic\-funcptrs\fR +Treat user-space function pointers to signal handlers as pointers to address +descriptors. This option has no effect on architectures that do not support +FDPIC ELF binaries. In kernel v4.14 support is limited to ARM, Blackfin, +Fujitsu FR-V, and SuperH CPU architectures. +.TP +\fB\-I\fR, \fB\-\-short\-inode\fR +Obsolete bug emulation flag. Turns on SHORT_INODE. +.TP +\fB\-L\fR, \fB\-\-addr\-compat\-layout\fR +Provide legacy virtual address space layout. Use when the +.I program +binary does not have PT_GNU_STACK ELF header. Turns on +ADDR_COMPAT_LAYOUT. +.TP +\fB\-R\fR, \fB\-\-addr\-no\-randomize\fR +Disables randomization of the virtual address space. Turns on +ADDR_NO_RANDOMIZE. +.TP +\fB\-S\fR, \fB\-\-whole\-seconds\fR +Obsolete bug emulation flag. Turns on WHOLE_SECONDS. +.TP +\fB\-T\fR, \fB\-\-sticky\-timeouts\fR +This makes +.BR select (2), +.BR pselect (2), +and +.BR ppoll (2) +system calls preserve the timeout value instead of modifying it to reflect +the amount of time not slept when interrupted by a signal handler. Use when +.I program +depends on this behavior. For more details see the timeout description in +.BR select (2) +manual page. Turns on STICKY_TIMEOUTS. +.TP +\fB\-X\fR, \fB\-\-read\-implies\-exec\fR +If this is set then +.BR mmap (3) +PROT_READ will also add the PROT_EXEC bit - as expected by legacy x86 +binaries. Notice that the ELF loader will automatically set this bit when +it encounters a legacy binary. Turns on READ_IMPLIES_EXEC. +.TP +\fB\-Z\fR, \fB\-\-mmap\-page\-zero\fR +SVr4 bug emulation that will set +.BR mmap (3) +page zero as read-only. Use when +.I program +depends on this behavior, and the source code is not available to be fixed. +Turns on MMAP_PAGE_ZERO. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH EXAMPLES +setarch --addr-no-randomize mytestprog +.br +setarch ppc32 rpmbuild --target=ppc --rebuild foo.src.rpm +.br +setarch ppc32 -v -vL3 rpmbuild --target=ppc --rebuild bar.src.rpm +.br +setarch ppc32 --32bit rpmbuild --target=ppc --rebuild foo.src.rpm +.SH AUTHOR +.MT sopwith@redhat.com +Elliot Lee +.ME +.br +.MT jnovy@redhat.com +Jindrich Novy +.ME +.br +.MT kzak@redhat.com +Karel Zak +.ME +.SH "SEE ALSO" +.BR personality (2), +.BR select (2) +.SH AVAILABILITY +The setarch command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/setarch.c b/sys-utils/setarch.c new file mode 100644 index 0000000..7c0a63f --- /dev/null +++ b/sys-utils/setarch.c @@ -0,0 +1,446 @@ +/* + * Copyright (C) 2003-2007 Red Hat, Inc. + * + * This file is part of util-linux. + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Written by Elliot Lee <sopwith@redhat.com> + * New personality options & code added by Jindrich Novy <jnovy@redhat.com> + * ADD_NO_RANDOMIZE flag added by Arjan van de Ven <arjanv@redhat.com> + * Help and MIPS support from Mike Frysinger (vapier@gentoo.org) + * Better error handling from Dmitry V. Levin (ldv@altlinux.org) + * + * based on ideas from the ppc32 util by Guy Streeter (2002-01), based on the + * sparc32 util by Jakub Jelinek (1998, 1999) + */ + +#include <sys/personality.h> +#include <unistd.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <getopt.h> +#include <limits.h> +#include <sys/utsname.h> +#include "nls.h" +#include "c.h" +#include "closestream.h" + +#ifndef HAVE_PERSONALITY +# include <syscall.h> +# define personality(pers) ((long)syscall(SYS_personality, pers)) +#endif + +#define turn_on(_flag, _opts) \ + do { \ + (_opts) |= _flag; \ + if (verbose) \ + printf(_("Switching on %s.\n"), #_flag); \ + } while(0) + +#ifndef UNAME26 +# define UNAME26 0x0020000 +#endif +#ifndef ADDR_NO_RANDOMIZE +# define ADDR_NO_RANDOMIZE 0x0040000 +#endif +#ifndef FDPIC_FUNCPTRS +# define FDPIC_FUNCPTRS 0x0080000 +#endif +#ifndef MMAP_PAGE_ZERO +# define MMAP_PAGE_ZERO 0x0100000 +#endif +#ifndef ADDR_COMPAT_LAYOUT +# define ADDR_COMPAT_LAYOUT 0x0200000 +#endif +#ifndef READ_IMPLIES_EXEC +# define READ_IMPLIES_EXEC 0x0400000 +#endif +#ifndef ADDR_LIMIT_32BIT +# define ADDR_LIMIT_32BIT 0x0800000 +#endif +#ifndef SHORT_INODE +# define SHORT_INODE 0x1000000 +#endif +#ifndef WHOLE_SECONDS +# define WHOLE_SECONDS 0x2000000 +#endif +#ifndef STICKY_TIMEOUTS +# define STICKY_TIMEOUTS 0x4000000 +#endif +#ifndef ADDR_LIMIT_3GB +# define ADDR_LIMIT_3GB 0x8000000 +#endif + + +struct arch_domain { + int perval; /* PER_* */ + const char *target_arch; + const char *result_arch; +}; + + +static void __attribute__((__noreturn__)) usage(int archwrapper) +{ + fputs(USAGE_HEADER, stdout); + if (!archwrapper) + printf(_(" %s [<arch>] [options] [<program> [<argument>...]]\n"), program_invocation_short_name); + else + printf(_(" %s [options] [<program> [<argument>...]]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, stdout); + fputs(_("Change the reported architecture and set personality flags.\n"), stdout); + + fputs(USAGE_OPTIONS, stdout); + fputs(_(" -B, --32bit turns on ADDR_LIMIT_32BIT\n"), stdout); + fputs(_(" -F, --fdpic-funcptrs makes function pointers point to descriptors\n"), stdout); + fputs(_(" -I, --short-inode turns on SHORT_INODE\n"), stdout); + fputs(_(" -L, --addr-compat-layout changes the way virtual memory is allocated\n"), stdout); + fputs(_(" -R, --addr-no-randomize disables randomization of the virtual address space\n"), stdout); + fputs(_(" -S, --whole-seconds turns on WHOLE_SECONDS\n"), stdout); + fputs(_(" -T, --sticky-timeouts turns on STICKY_TIMEOUTS\n"), stdout); + fputs(_(" -X, --read-implies-exec turns on READ_IMPLIES_EXEC\n"), stdout); + fputs(_(" -Z, --mmap-page-zero turns on MMAP_PAGE_ZERO\n"), stdout); + fputs(_(" -3, --3gb limits the used address space to a maximum of 3 GB\n"), stdout); + fputs(_(" --4gb ignored (for backward compatibility only)\n"), stdout); + fputs(_(" --uname-2.6 turns on UNAME26\n"), stdout); + fputs(_(" -v, --verbose say what options are being switched on\n"), stdout); + + if (!archwrapper) + fputs(_(" --list list settable architectures, and exit\n"), stdout); + + fputs(USAGE_SEPARATOR, stdout); + printf(USAGE_HELP_OPTIONS(26)); + printf(USAGE_MAN_TAIL("setarch(8)")); + + exit(EXIT_SUCCESS); +} + +/* + * Returns inilialized list of all available execution domains. + */ +static struct arch_domain *init_arch_domains(void) +{ + struct utsname un; + size_t i; + + static struct arch_domain transitions[] = + { + {UNAME26, "uname26", NULL}, + {PER_LINUX32, "linux32", NULL}, + {PER_LINUX, "linux64", NULL}, +#if defined(__powerpc__) || defined(__powerpc64__) +# ifdef __BIG_ENDIAN__ + {PER_LINUX32, "ppc32", "ppc"}, + {PER_LINUX32, "ppc", "ppc"}, + {PER_LINUX, "ppc64", "ppc64"}, + {PER_LINUX, "ppc64pseries", "ppc64"}, + {PER_LINUX, "ppc64iseries", "ppc64"}, +# else + {PER_LINUX32, "ppc32", "ppcle"}, + {PER_LINUX32, "ppc", "ppcle"}, + {PER_LINUX32, "ppc32le", "ppcle"}, + {PER_LINUX32, "ppcle", "ppcle"}, + {PER_LINUX, "ppc64le", "ppc64le"}, +# endif +#endif +#if defined(__x86_64__) || defined(__i386__) || defined(__ia64__) + {PER_LINUX32, "i386", "i386"}, + {PER_LINUX32, "i486", "i386"}, + {PER_LINUX32, "i586", "i386"}, + {PER_LINUX32, "i686", "i386"}, + {PER_LINUX32, "athlon", "i386"}, +#endif +#if defined(__x86_64__) || defined(__i386__) + {PER_LINUX, "x86_64", "x86_64"}, +#endif +#if defined(__ia64__) || defined(__i386__) + {PER_LINUX, "ia64", "ia64"}, +#endif +#if defined(__hppa__) + {PER_LINUX32, "parisc32", "parisc"}, + {PER_LINUX32, "parisc", "parisc"}, + {PER_LINUX, "parisc64", "parisc64"}, +#endif +#if defined(__s390x__) || defined(__s390__) + {PER_LINUX32, "s390", "s390"}, + {PER_LINUX, "s390x", "s390x"}, +#endif +#if defined(__sparc64__) || defined(__sparc__) + {PER_LINUX32, "sparc", "sparc"}, + {PER_LINUX32, "sparc32bash", "sparc"}, + {PER_LINUX32, "sparc32", "sparc"}, + {PER_LINUX, "sparc64", "sparc64"}, +#endif +#if defined(__mips64__) || defined(__mips__) + {PER_LINUX32, "mips32", "mips"}, + {PER_LINUX32, "mips", "mips"}, + {PER_LINUX, "mips64", "mips64"}, +#endif +#if defined(__alpha__) + {PER_LINUX, "alpha", "alpha"}, + {PER_LINUX, "alphaev5", "alpha"}, + {PER_LINUX, "alphaev56", "alpha"}, + {PER_LINUX, "alphaev6", "alpha"}, + {PER_LINUX, "alphaev67", "alpha"}, +#endif + /* place holder, will be filled up at runtime */ + {-1, NULL, NULL}, + {-1, NULL, NULL} + }; + + /* Add the trivial transition {PER_LINUX, machine, machine} if no + * such target_arch is hardcoded yet. */ + uname(&un); + for (i = 0; transitions[i].perval >= 0; i++) + if (!strcmp(un.machine, transitions[i].target_arch)) + break; + if (transitions[i].perval < 0) { + unsigned long wrdsz = CHAR_BIT * sizeof(void *); + if (wrdsz == 32 || wrdsz == 64) { + /* fill up the place holder */ + transitions[i].perval = wrdsz == 32 ? PER_LINUX32 : PER_LINUX; + transitions[i].target_arch = un.machine; + transitions[i].result_arch = un.machine; + } + } + + return transitions; +} + +/* + * List all execution domains from transitions + */ +static void list_arch_domains(struct arch_domain *doms) +{ + struct arch_domain *d; + + for (d = doms; d->target_arch != NULL; d++) + printf("%s\n", d->target_arch); +} + +static struct arch_domain *get_arch_domain(struct arch_domain *doms, const char *pers) +{ + struct arch_domain *d; + + for (d = doms; d->perval >= 0; d++) { + if (!strcmp(pers, d->target_arch)) + break; + } + + return !d || d->perval < 0 ? NULL : d; +} + +static void verify_arch_domain(struct arch_domain *dom, const char *wanted) +{ + struct utsname un; + + if (!dom || !dom->result_arch) + return; + + uname(&un); + if (strcmp(un.machine, dom->result_arch)) { + if (strcmp(dom->result_arch, "i386") + || (strcmp(un.machine, "i486") + && strcmp(un.machine, "i586") + && strcmp(un.machine, "i686") + && strcmp(un.machine, "athlon"))) + errx(EXIT_FAILURE, _("Kernel cannot set architecture to %s"), wanted); + } +} + +int main(int argc, char *argv[]) +{ + const char *arch = NULL; + unsigned long options = 0; + int verbose = 0; + int archwrapper; + int c; + struct arch_domain *doms, *target; + unsigned long pers_value = 0; + char *shell = NULL, *shell_arg = NULL; + + /* Options without equivalent short options */ + enum { + OPT_4GB = CHAR_MAX + 1, + OPT_UNAME26, + OPT_LIST + }; + + /* Options --3gb and --4gb are for compatibility with an old + * Debian setarch implementation. */ + static const struct option longopts[] = { + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + {"verbose", no_argument, NULL, 'v'}, + {"addr-no-randomize", no_argument, NULL, 'R'}, + {"fdpic-funcptrs", no_argument, NULL, 'F'}, + {"mmap-page-zero", no_argument, NULL, 'Z'}, + {"addr-compat-layout", no_argument, NULL, 'L'}, + {"read-implies-exec", no_argument, NULL, 'X'}, + {"32bit", no_argument, NULL, 'B'}, + {"short-inode", no_argument, NULL, 'I'}, + {"whole-seconds", no_argument, NULL, 'S'}, + {"sticky-timeouts", no_argument, NULL, 'T'}, + {"3gb", no_argument, NULL, '3'}, + {"4gb", no_argument, NULL, OPT_4GB}, + {"uname-2.6", no_argument, NULL, OPT_UNAME26}, + {"list", no_argument, NULL, OPT_LIST}, + {NULL, 0, NULL, 0} + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + if (argc < 1) { + warnx(_("Not enough arguments")); + errtryhelp(EXIT_FAILURE); + } + archwrapper = strcmp(program_invocation_short_name, "setarch") != 0; + if (archwrapper) { + arch = program_invocation_short_name; /* symlinks to setarch */ + + /* Don't use ifdef sparc here, we get "Unrecognized architecture" + * error message later if necessary */ + if (strcmp(arch, "sparc32bash") == 0) { + shell = "/bin/bash"; + shell_arg = ""; + goto set_arch; + } + } else { + if (1 < argc && *argv[1] != '-') { + arch = argv[1]; + argv[1] = argv[0]; /* for getopt_long() to get the program name */ + argv++; + argc--; + } + } + + while ((c = getopt_long(argc, argv, "+hVv3BFILRSTXZ", longopts, NULL)) != -1) { + switch (c) { + case 'h': + usage(archwrapper); + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'v': + verbose = 1; + break; + case 'R': + turn_on(ADDR_NO_RANDOMIZE, options); + break; + case 'F': + turn_on(FDPIC_FUNCPTRS, options); + break; + case 'Z': + turn_on(MMAP_PAGE_ZERO, options); + break; + case 'L': + turn_on(ADDR_COMPAT_LAYOUT, options); + break; + case 'X': + turn_on(READ_IMPLIES_EXEC, options); + break; + case 'B': + turn_on(ADDR_LIMIT_32BIT, options); + break; + case 'I': + turn_on(SHORT_INODE, options); + break; + case 'S': + turn_on(WHOLE_SECONDS, options); + break; + case 'T': + turn_on(STICKY_TIMEOUTS, options); + break; + case '3': + turn_on(ADDR_LIMIT_3GB, options); + break; + case OPT_4GB: /* just ignore this one */ + break; + case OPT_UNAME26: + turn_on(UNAME26, options); + break; + case OPT_LIST: + if (!archwrapper) { + list_arch_domains(init_arch_domains()); + return EXIT_SUCCESS; + } else + warnx(_("unrecognized option '--list'")); + /* fallthrough */ + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (!arch && !options) + errx(EXIT_FAILURE, _("no architecture argument or personality flags specified")); + + argc -= optind; + argv += optind; + +set_arch: + /* get execution domain (architecture) */ + if (arch) { + doms = init_arch_domains(); + target = get_arch_domain(doms, arch); + + if (!target) + errx(EXIT_FAILURE, _("%s: Unrecognized architecture"), arch); + pers_value = target->perval; + } + + /* add personality flags */ + pers_value |= options; + + /* call kernel */ + if (personality(pers_value) < 0) { + /* + * Depending on architecture and kernel version, personality + * syscall is either capable or incapable of returning an error. + * If the return value is not an error, then it's the previous + * personality value, which can be an arbitrary value + * undistinguishable from an error value. + * To make things clear, a second call is needed. + */ + if (personality(pers_value) < 0) + err(EXIT_FAILURE, _("failed to set personality to %s"), arch); + } + + /* make sure architecture is set as expected */ + if (arch) + verify_arch_domain(target, arch); + + if (!argc) { + shell = "/bin/sh"; + shell_arg = "-sh"; + } + if (verbose) { + printf(_("Execute command `%s'.\n"), shell ? shell : argv[0]); + /* flush all output streams before exec */ + fflush(NULL); + } + + /* Execute shell */ + if (shell) { + execl(shell, shell_arg, NULL); + errexec(shell); + } + + /* Execute on command line specified command */ + execvp(argv[0], argv); + errexec(argv[0]); +} diff --git a/sys-utils/setpriv.1 b/sys-utils/setpriv.1 new file mode 100644 index 0000000..9ff9058 --- /dev/null +++ b/sys-utils/setpriv.1 @@ -0,0 +1,222 @@ +.TH SETPRIV 1 "July 2014" "util-linux" "User Commands" +.SH NAME +setpriv \- run a program with different Linux privilege settings +.SH SYNOPSIS +.B setpriv +[options] +.I program +.RI [ arguments ] +.SH DESCRIPTION +Sets or queries various Linux privilege settings that are inherited across +.BR execve (2). +.PP +In comparison to +.BR su (1) +and +.BR runuser (1), +.BR setpriv (1) +neither uses PAM, nor does it prompt for a password. +It is a simple, non-set-user-ID wrapper around +.BR execve (2), +and can be used to drop privileges in the same way as +.BR setuidgid (8) +from +.BR daemontools , +.BR chpst (8) +from +.BR runit , +or similar tools shipped by other service managers. +.SH OPTION +.TP +.B \-\-clear\-groups +Clear supplementary groups. +.TP +.BR \-d , " \-\-dump" +Dump current privilege state. Can be specified more than once to show extra, +mostly useless, information. Incompatible with all other options. +.TP +.B \-\-groups \fIgroup\fR... +Set supplementary groups. The argument is a comma-separated list of GIDs or names. +.TP +.BR \-\-inh\-caps " (" + | \- ) \fIcap "... or " \-\-ambient-caps " (" + | \- ) \fIcap "... or " \-\-bounding\-set " (" + | \- ) \fIcap ... +Set the inheritable capabilities, ambient capabilities or the capability bounding set. See +.BR capabilities (7). +The argument is a comma-separated list of +.BI + cap +and +.BI \- cap +entries, which add or remove an entry respectively. \fIcap\fR can either be a +human-readable name as seen in +.BR capabilities (7) +without the \fIcap_\fR prefix or of the format +.BI cap_N , +where \fIN\fR is the internal capability index used by Linux. +.B +all +and +.B \-all +can be used to add or remove all caps. The set of capabilities starts out as +the current inheritable set for +.BR \-\-inh\-caps , +the current ambient set for +.B \-\-ambient\-caps +and the current bounding set for +.BR \-\-bounding\-set . +If you drop something from the bounding set without also dropping it from the +inheritable set, you are likely to become confused. Do not do that. +.TP +.B \-\-keep\-groups +Preserve supplementary groups. Only useful in conjunction with +.BR \-\-rgid , +.BR \-\-egid ", or" +.BR \-\-regid . +.TP +.B \-\-init\-groups +Initialize supplementary groups using +.BR initgroups "(3)." +Only useful in conjunction with +.B \-\-ruid +or +.BR \-\-reuid . +.TP +.B \-\-list\-caps +List all known capabilities. This option must be specified alone. +.TP +.B \-\-no\-new\-privs +Set the +.I no_new_privs +bit. With this bit set, +.BR execve (2) +will not grant new privileges. +For example, the set-user-ID and set-group-ID bits as well +as file capabilities will be disabled. (Executing binaries with these bits set +will still work, but they will not gain privileges. Certain LSMs, especially +AppArmor, may result in failures to execute certain programs.) This bit is +inherited by child processes and cannot be unset. See +.BR prctl (2) +and +.I Documentation/\:prctl/\:no_\:new_\:privs.txt +in the Linux kernel source. +.sp +The no_new_privs bit is supported since Linux 3.5. +.TP +.BI \-\-rgid " gid\fR, " \-\-egid " gid\fR, " \-\-regid " gid" +Set the real, effective, or both GIDs. The \fIgid\fR argument can be +given as textual group name. +.sp +For safety, you must specify one of +.BR \-\-clear\-groups , +.BR \-\-groups , +.BR \-\-keep\-groups ", or" +.B \-\-init\-groups +if you set any primary +.IR gid . +.TP +.BI \-\-ruid " uid\fR, " \-\-euid " uid\fR, " \-\-reuid " uid" +Set the real, effective, or both UIDs. The \fIuid\fR argument can be +given as textual login name. +.sp +Setting a +.I uid +or +.I gid +does not change capabilities, although the exec call at the end might change +capabilities. This means that, if you are root, you probably want to do +something like: +.sp +.B " setpriv \-\-reuid=1000 \-\-regid=1000 \-\-inh\-caps=\-all" +.TP +.BR \-\-securebits " (" + | \- ) \fIsecurebit ... +Set or clear securebits. The argument is a comma-separated list. +The valid securebits are +.IR noroot , +.IR noroot_locked , +.IR no_setuid_fixup , +.IR no_setuid_fixup_locked , +and +.IR keep_caps_locked . +.I keep_caps +is cleared by +.BR execve (2) +and is therefore not allowed. +.TP +.BR "\-\-pdeathsig keep" | clear | <signal> +Keep, clear or set the parent death signal. Some LSMs, most notably SELinux and +AppArmor, clear the signal when the process' credentials change. Using +\fB--pdeathsig keep\fR will restore the parent death signal after changing +credentials to remedy that situation. +.TP +.BI \-\-selinux\-label " label" +Request a particular SELinux transition (using a transition on exec, not +dyntrans). This will fail and cause +.BR setpriv (1) +to abort if SELinux is not in use, and the transition may be ignored or cause +.BR execve (2) +to fail at SELinux's whim. (In particular, this is unlikely to work in +conjunction with +.IR no_new_privs .) +This is similar to +.BR runcon (1). +.TP +.BI \-\-apparmor\-profile " profile" +Request a particular AppArmor profile (using a transition on exec). This will +fail and cause +.BR setpriv (1) +to abort if AppArmor is not in use, and the transition may be ignored or cause +.BR execve (2) +to fail at AppArmor's whim. +.TP +.BI \-\-reset\-env +Clears all the environment variables except TERM; initializes the environment variables HOME, SHELL, USER, LOGNAME +according to the user's passwd entry; sets PATH to \fI/usr/local/bin:/bin:/usr/bin\fR for a regual user and to +\fI/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\fR for root. +.sp +The environment variable PATH may be different on systems where /bin and /sbin +are merged into /usr. The environment variable SHELL defaults to \fI/bin/sh\fR if none is given in the user's +passwd entry. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH NOTES +If applying any specified option fails, +.I program +will not be run and +.B setpriv +will return with exit code 127. +.PP +Be careful with this tool \-\- it may have unexpected security consequences. +For example, setting no_new_privs and then execing a program that is +SELinux\-confined (as this tool would do) may prevent the SELinux +restrictions from taking effect. +.SH EXAMPLE +If you're looking for behaviour similar to +.BR su (1)/ runuser "(1), or " sudo (8) +(without the +.B -g +option), try something like: +.sp +.B " setpriv \-\-reuid=1000 \-\-regid=1000 \-\-init\-groups" +.PP +If you want to mimic daemontools' +.BR setuid (8), +try: +.sp +.B " setpriv \-\-reuid=1000 \-\-regid=1000 \-\-clear\-groups" +.SH SEE ALSO +.BR runuser (1), +.BR su (1), +.BR prctl (2), +.BR capabilities (7) +.SH AUTHOR +.MT luto@amacapital.net +Andy Lutomirski +.ME +.SH AVAILABILITY +The +.B setpriv +command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/setpriv.c b/sys-utils/setpriv.c new file mode 100644 index 0000000..828ddc1 --- /dev/null +++ b/sys-utils/setpriv.c @@ -0,0 +1,1096 @@ +/* + * setpriv(1) - set various kernel privilege bits and run something + * + * Copyright (C) 2012 Andy Lutomirski <luto@amacapital.net> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <cap-ng.h> +#include <errno.h> +#include <getopt.h> +#include <grp.h> +#include <linux/securebits.h> +#include <pwd.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <unistd.h> + +#include "c.h" +#include "closestream.h" +#include "nls.h" +#include "optutils.h" +#include "strutils.h" +#include "xalloc.h" +#include "pathnames.h" +#include "signames.h" +#include "env.h" + +#ifndef PR_SET_NO_NEW_PRIVS +# define PR_SET_NO_NEW_PRIVS 38 +#endif +#ifndef PR_GET_NO_NEW_PRIVS +# define PR_GET_NO_NEW_PRIVS 39 +#endif + +#ifndef PR_CAP_AMBIENT +# define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +#endif + +#define SETPRIV_EXIT_PRIVERR 127 /* how we exit when we fail to set privs */ + +/* The shell to set SHELL env.variable if none is given in the user's passwd entry. */ +#define DEFAULT_SHELL "/bin/sh" + +static gid_t get_group(const char *s, const char *err); + +enum cap_type { + CAP_TYPE_EFFECTIVE = CAPNG_EFFECTIVE, + CAP_TYPE_PERMITTED = CAPNG_PERMITTED, + CAP_TYPE_INHERITABLE = CAPNG_INHERITABLE, + CAP_TYPE_BOUNDING = CAPNG_BOUNDING_SET, + CAP_TYPE_AMBIENT = (1 << 4) +}; + +/* + * Note: We are subject to https://bugzilla.redhat.com/show_bug.cgi?id=895105 + * and we will therefore have problems if new capabilities are added. Once + * that bug is fixed, I'll (Andy Lutomirski) submit a corresponding fix to + * setpriv. In the mean time, the code here tries to work reasonably well. + */ + +struct privctx { + unsigned int + nnp:1, /* no_new_privs */ + have_ruid:1, /* real uid */ + have_euid:1, /* effective uid */ + have_rgid:1, /* real gid */ + have_egid:1, /* effective gid */ + have_passwd:1, /* passwd entry */ + have_groups:1, /* add groups */ + keep_groups:1, /* keep groups */ + clear_groups:1, /* remove groups */ + init_groups:1, /* initialize groups */ + reset_env:1, /* reset environment */ + have_securebits:1; /* remove groups */ + + /* uids and gids */ + uid_t ruid, euid; + gid_t rgid, egid; + + /* real user passwd entry */ + struct passwd passwd; + + /* supplementary groups */ + size_t num_groups; + gid_t *groups; + + /* caps */ + const char *caps_to_inherit; + const char *ambient_caps; + const char *bounding_set; + + /* securebits */ + int securebits; + /* parent death signal (<0 clear, 0 nothing, >0 signal) */ + int pdeathsig; + + /* LSMs */ + const char *selinux_label; + const char *apparmor_profile; +}; + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] <program> [<argument>...]\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Run a program with different privilege settings.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -d, --dump show current state (and do not exec)\n"), out); + fputs(_(" --nnp, --no-new-privs disallow granting new privileges\n"), out); + fputs(_(" --ambient-caps <caps,...> set ambient capabilities\n"), out); + fputs(_(" --inh-caps <caps,...> set inheritable capabilities\n"), out); + fputs(_(" --bounding-set <caps> set capability bounding set\n"), out); + fputs(_(" --ruid <uid|user> set real uid\n"), out); + fputs(_(" --euid <uid|user> set effective uid\n"), out); + fputs(_(" --rgid <gid|user> set real gid\n"), out); + fputs(_(" --egid <gid|group> set effective gid\n"), out); + fputs(_(" --reuid <uid|user> set real and effective uid\n"), out); + fputs(_(" --regid <gid|group> set real and effective gid\n"), out); + fputs(_(" --clear-groups clear supplementary groups\n"), out); + fputs(_(" --keep-groups keep supplementary groups\n"), out); + fputs(_(" --init-groups initialize supplementary groups\n"), out); + fputs(_(" --groups <group,...> set supplementary groups by UID or name\n"), out); + fputs(_(" --securebits <bits> set securebits\n"), out); + fputs(_(" --pdeathsig keep|clear|<signame>\n" + " set or clear parent death signal\n"), out); + fputs(_(" --selinux-label <label> set SELinux label\n"), out); + fputs(_(" --apparmor-profile <pr> set AppArmor profile\n"), out); + fputs(_(" --reset-env clear all environment and initialize\n" + " HOME, SHELL, USER, LOGNAME and PATH\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(29)); + fputs(USAGE_SEPARATOR, out); + fputs(_(" This tool can be dangerous. Read the manpage, and be careful.\n"), out); + printf(USAGE_MAN_TAIL("setpriv(1)")); + + exit(EXIT_SUCCESS); +} + +static int real_cap_last_cap(void) +{ + /* CAP_LAST_CAP is untrustworthy. */ + static int ret = -1; + int matched; + FILE *f; + + if (ret != -1) + return ret; + + f = fopen(_PATH_PROC_CAPLASTCAP, "r"); + if (!f) { + ret = CAP_LAST_CAP; /* guess */ + return ret; + } + + matched = fscanf(f, "%d", &ret); + fclose(f); + + if (matched != 1) + ret = CAP_LAST_CAP; /* guess */ + + return ret; +} + +static int has_cap(enum cap_type which, unsigned int i) +{ + switch (which) { + case CAP_TYPE_EFFECTIVE: + case CAP_TYPE_BOUNDING: + case CAP_TYPE_INHERITABLE: + case CAP_TYPE_PERMITTED: + return capng_have_capability((capng_type_t)which, i); + case CAP_TYPE_AMBIENT: + return prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, + (unsigned long) i, 0UL, 0UL); + default: + warnx(_("invalid capability type")); + return -1; + } +} + +/* Returns the number of capabilities printed. */ +static int print_caps(FILE *f, enum cap_type which) +{ + int i, n = 0, max = real_cap_last_cap(); + + for (i = 0; i <= max; i++) { + int ret = has_cap(which, i); + + if (i == 0 && ret < 0) + return -1; + + if (ret == 1) { + const char *name = capng_capability_to_name(i); + if (n) + fputc(',', f); + if (name) + fputs(name, f); + else + /* cap-ng has very poor handling of + * CAP_LAST_CAP changes. This is the + * best we can do. */ + printf("cap_%d", i); + n++; + } + } + + return n; +} + +static void dump_one_secbit(int *first, int *bits, int bit, const char *name) +{ + if (*bits & bit) { + if (*first) + *first = 0; + else + printf(","); + fputs(name, stdout); + *bits &= ~bit; + } +} + +static void dump_securebits(void) +{ + int first = 1; + int bits = prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); + + if (bits < 0) { + warnx(_("getting process secure bits failed")); + return; + } + + printf(_("Securebits: ")); + + dump_one_secbit(&first, &bits, SECBIT_NOROOT, "noroot"); + dump_one_secbit(&first, &bits, SECBIT_NOROOT_LOCKED, "noroot_locked"); + dump_one_secbit(&first, &bits, SECBIT_NO_SETUID_FIXUP, + "no_setuid_fixup"); + dump_one_secbit(&first, &bits, SECBIT_NO_SETUID_FIXUP_LOCKED, + "no_setuid_fixup_locked"); + bits &= ~SECBIT_KEEP_CAPS; + dump_one_secbit(&first, &bits, SECBIT_KEEP_CAPS_LOCKED, + "keep_caps_locked"); + if (bits) { + if (first) + first = 0; + else + printf(","); + printf("0x%x", (unsigned)bits); + } + + if (first) + printf(_("[none]\n")); + else + printf("\n"); +} + +static void dump_label(const char *name) +{ + char buf[4097]; + ssize_t len; + int fd, e; + + fd = open(_PATH_PROC_ATTR_CURRENT, O_RDONLY); + if (fd == -1) { + warn(_("cannot open %s"), _PATH_PROC_ATTR_CURRENT); + return; + } + + len = read(fd, buf, sizeof(buf)); + e = errno; + close(fd); + if (len < 0) { + errno = e; + warn(_("cannot read %s"), name); + return; + } + if (sizeof(buf) - 1 <= (size_t)len) { + warnx(_("%s: too long"), name); + return; + } + + buf[len] = 0; + if (0 < len && buf[len - 1] == '\n') + buf[len - 1] = 0; + printf("%s: %s\n", name, buf); +} + +static void dump_groups(void) +{ + int n = getgroups(0, NULL); + gid_t *groups; + + if (n < 0) { + warn("getgroups failed"); + return; + } + + groups = xmalloc(n * sizeof(gid_t)); + n = getgroups(n, groups); + if (n < 0) { + free(groups); + warn("getgroups failed"); + return; + } + + printf(_("Supplementary groups: ")); + if (n == 0) + printf(_("[none]")); + else { + int i; + for (i = 0; i < n; i++) { + if (0 < i) + printf(","); + printf("%ld", (long)groups[i]); + } + } + printf("\n"); + free(groups); +} + +static void dump_pdeathsig(void) +{ + int pdeathsig; + + if (prctl(PR_GET_PDEATHSIG, &pdeathsig) != 0) { + warn(_("get pdeathsig failed")); + return; + } + + printf("Parent death signal: "); + if (pdeathsig && signum_to_signame(pdeathsig) != NULL) + printf("%s\n", signum_to_signame(pdeathsig)); + else if (pdeathsig) + printf("%d\n", pdeathsig); + else + printf("[none]\n"); +} + +static void dump(int dumplevel) +{ + int x; + uid_t ru, eu, su; + gid_t rg, eg, sg; + + if (getresuid(&ru, &eu, &su) == 0) { + printf(_("uid: %u\n"), ru); + printf(_("euid: %u\n"), eu); + /* Saved and fs uids always equal euid. */ + if (3 <= dumplevel) + printf(_("suid: %u\n"), su); + } else + warn(_("getresuid failed")); + + if (getresgid(&rg, &eg, &sg) == 0) { + printf("gid: %ld\n", (long)rg); + printf("egid: %ld\n", (long)eg); + /* Saved and fs gids always equal egid. */ + if (dumplevel >= 3) + printf("sgid: %ld\n", (long)sg); + } else + warn(_("getresgid failed")); + + dump_groups(); + + x = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (0 <= x) + printf("no_new_privs: %d\n", x); + else + warn("setting no_new_privs failed"); + + if (2 <= dumplevel) { + printf(_("Effective capabilities: ")); + if (print_caps(stdout, CAP_TYPE_EFFECTIVE) == 0) + printf(_("[none]")); + printf("\n"); + + printf(_("Permitted capabilities: ")); + if (print_caps(stdout, CAP_TYPE_PERMITTED) == 0) + printf(_("[none]")); + printf("\n"); + } + + printf(_("Inheritable capabilities: ")); + if (print_caps(stdout, CAP_TYPE_INHERITABLE) == 0) + printf(_("[none]")); + printf("\n"); + + printf(_("Ambient capabilities: ")); + x = print_caps(stdout, CAP_TYPE_AMBIENT); + if (x == 0) + printf(_("[none]")); + if (x < 0) + printf(_("[unsupported]")); + printf("\n"); + + printf(_("Capability bounding set: ")); + if (print_caps(stdout, CAP_TYPE_BOUNDING) == 0) + printf(_("[none]")); + printf("\n"); + + dump_securebits(); + dump_pdeathsig(); + + if (access(_PATH_SYS_SELINUX, F_OK) == 0) + dump_label(_("SELinux label")); + + if (access(_PATH_SYS_APPARMOR, F_OK) == 0) { + dump_label(_("AppArmor profile")); + } +} + +static void list_known_caps(void) +{ + int i, max = real_cap_last_cap(); + + for (i = 0; i <= max; i++) { + const char *name = capng_capability_to_name(i); + if (name) + printf("%s\n", name); + else + warnx(_("cap %d: libcap-ng is broken"), i); + } +} + +static void parse_groups(struct privctx *opts, const char *str) +{ + char *groups = xstrdup(str); + char *buf = groups; /* We'll reuse it */ + char *c; + size_t i = 0; + + opts->have_groups = 1; + opts->num_groups = 0; + while ((c = strsep(&groups, ","))) + opts->num_groups++; + + /* Start again */ + strcpy(buf, str); /* It's exactly the right length */ + groups = buf; + + opts->groups = xcalloc(opts->num_groups, sizeof(gid_t)); + while ((c = strsep(&groups, ","))) + opts->groups[i++] = get_group(c, _("Invalid supplementary group id")); + + free(groups); +} + +static void parse_pdeathsig(struct privctx *opts, const char *str) +{ + if (!strcmp(str, "keep")) { + if (prctl(PR_GET_PDEATHSIG, &opts->pdeathsig) != 0) + errx(SETPRIV_EXIT_PRIVERR, + _("failed to get parent death signal")); + } else if (!strcmp(str, "clear")) { + opts->pdeathsig = -1; + } else if ((opts->pdeathsig = signame_to_signum(str)) < 0) { + errx(EXIT_FAILURE, _("unknown signal: %s"), str); + } +} + +static void do_setresuid(const struct privctx *opts) +{ + uid_t ruid, euid, suid; + if (getresuid(&ruid, &euid, &suid) != 0) + err(SETPRIV_EXIT_PRIVERR, _("getresuid failed")); + if (opts->have_ruid) + ruid = opts->ruid; + if (opts->have_euid) + euid = opts->euid; + + /* Also copy effective to saved (for paranoia). */ + if (setresuid(ruid, euid, euid) != 0) + err(SETPRIV_EXIT_PRIVERR, _("setresuid failed")); +} + +static void do_setresgid(const struct privctx *opts) +{ + gid_t rgid, egid, sgid; + if (getresgid(&rgid, &egid, &sgid) != 0) + err(SETPRIV_EXIT_PRIVERR, _("getresgid failed")); + if (opts->have_rgid) + rgid = opts->rgid; + if (opts->have_egid) + egid = opts->egid; + + /* Also copy effective to saved (for paranoia). */ + if (setresgid(rgid, egid, egid) != 0) + err(SETPRIV_EXIT_PRIVERR, _("setresgid failed")); +} + +static void bump_cap(unsigned int cap) +{ + if (capng_have_capability(CAPNG_PERMITTED, cap)) + capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap); +} + +static int cap_update(capng_act_t action, + enum cap_type type, unsigned int cap) +{ + switch (type) { + case CAP_TYPE_EFFECTIVE: + case CAP_TYPE_BOUNDING: + case CAP_TYPE_INHERITABLE: + case CAP_TYPE_PERMITTED: + return capng_update(action, (capng_type_t) type, cap); + case CAP_TYPE_AMBIENT: + { + int ret; + + if (action == CAPNG_ADD) + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, + (unsigned long) cap, 0UL, 0UL); + else + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, + (unsigned long) cap, 0UL, 0UL); + + return ret; + } + default: + errx(EXIT_FAILURE, _("unsupported capability type")); + return -1; + } +} + +static void do_caps(enum cap_type type, const char *caps) +{ + char *my_caps = xstrdup(caps); + char *c; + + while ((c = strsep(&my_caps, ","))) { + capng_act_t action; + if (*c == '+') + action = CAPNG_ADD; + else if (*c == '-') + action = CAPNG_DROP; + else + errx(EXIT_FAILURE, _("bad capability string")); + + if (!strcmp(c + 1, "all")) { + int i; + /* It would be really bad if -all didn't drop all + * caps. It's better to just fail. */ + if (real_cap_last_cap() > CAP_LAST_CAP) + errx(SETPRIV_EXIT_PRIVERR, + _("libcap-ng is too old for \"all\" caps")); + for (i = 0; i <= CAP_LAST_CAP; i++) + cap_update(action, type, i); + } else { + int cap = capng_name_to_capability(c + 1); + if (0 <= cap) + cap_update(action, type, cap); + else if (sscanf(c + 1, "cap_%d", &cap) == 1 + && 0 <= cap && cap <= real_cap_last_cap()) + cap_update(action, type, cap); + else + errx(EXIT_FAILURE, + _("unknown capability \"%s\""), c + 1); + } + } + + free(my_caps); +} + +static void parse_securebits(struct privctx *opts, const char *arg) +{ + char *buf = xstrdup(arg); + char *c; + + opts->have_securebits = 1; + opts->securebits = prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); + if (opts->securebits < 0) + err(SETPRIV_EXIT_PRIVERR, _("getting process secure bits failed")); + + if (opts->securebits & ~(int)(SECBIT_NOROOT | + SECBIT_NOROOT_LOCKED | + SECBIT_NO_SETUID_FIXUP | + SECBIT_NO_SETUID_FIXUP_LOCKED | + SECBIT_KEEP_CAPS | + SECBIT_KEEP_CAPS_LOCKED)) + errx(SETPRIV_EXIT_PRIVERR, + _("unrecognized securebit set -- refusing to adjust")); + + while ((c = strsep(&buf, ","))) { + if (*c != '+' && *c != '-') + errx(EXIT_FAILURE, _("bad securebits string")); + + if (!strcmp(c + 1, "all")) { + if (*c == '-') + opts->securebits = 0; + else + errx(EXIT_FAILURE, + _("+all securebits is not allowed")); + } else { + int bit; + if (!strcmp(c + 1, "noroot")) + bit = SECBIT_NOROOT; + else if (!strcmp(c + 1, "noroot_locked")) + bit = SECBIT_NOROOT_LOCKED; + else if (!strcmp(c + 1, "no_setuid_fixup")) + bit = SECBIT_NO_SETUID_FIXUP; + else if (!strcmp(c + 1, "no_setuid_fixup_locked")) + bit = SECBIT_NO_SETUID_FIXUP_LOCKED; + else if (!strcmp(c + 1, "keep_caps")) + errx(EXIT_FAILURE, + _("adjusting keep_caps does not make sense")); + else if (!strcmp(c + 1, "keep_caps_locked")) + bit = SECBIT_KEEP_CAPS_LOCKED; /* sigh */ + else + errx(EXIT_FAILURE, _("unrecognized securebit")); + + if (*c == '+') + opts->securebits |= bit; + else + opts->securebits &= ~bit; + } + } + + opts->securebits |= SECBIT_KEEP_CAPS; /* We need it, and it's reset on exec */ + + free(buf); +} + +static void do_selinux_label(const char *label) +{ + int fd; + size_t len; + + if (access(_PATH_SYS_SELINUX, F_OK) != 0) + errx(SETPRIV_EXIT_PRIVERR, _("SELinux is not running")); + + fd = open(_PATH_PROC_ATTR_EXEC, O_RDWR); + if (fd == -1) + err(SETPRIV_EXIT_PRIVERR, + _("cannot open %s"), _PATH_PROC_ATTR_EXEC); + + len = strlen(label); + errno = 0; + if (write(fd, label, len) != (ssize_t) len) + err(SETPRIV_EXIT_PRIVERR, + _("write failed: %s"), _PATH_PROC_ATTR_EXEC); + + if (close(fd) != 0) + err(SETPRIV_EXIT_PRIVERR, + _("close failed: %s"), _PATH_PROC_ATTR_EXEC); +} + +static void do_apparmor_profile(const char *label) +{ + FILE *f; + + if (access(_PATH_SYS_APPARMOR, F_OK) != 0) + errx(SETPRIV_EXIT_PRIVERR, _("AppArmor is not running")); + + f = fopen(_PATH_PROC_ATTR_EXEC, "r+"); + if (!f) + err(SETPRIV_EXIT_PRIVERR, + _("cannot open %s"), _PATH_PROC_ATTR_EXEC); + + fprintf(f, "exec %s", label); + + if (close_stream(f) != 0) + err(SETPRIV_EXIT_PRIVERR, + _("write failed: %s"), _PATH_PROC_ATTR_EXEC); +} + + +static void do_reset_environ(struct passwd *pw) +{ + char *term = getenv("TERM"); + + if (term) + term = xstrdup(term); +#ifdef HAVE_CLEARENV + clearenv(); +#else + environ = NULL; +#endif + if (term) + xsetenv("TERM", term, 1); + + if (pw->pw_shell && *pw->pw_shell) + xsetenv("SHELL", pw->pw_shell, 1); + else + xsetenv("SHELL", DEFAULT_SHELL, 1); + + xsetenv("HOME", pw->pw_dir, 1); + xsetenv("USER", pw->pw_name, 1); + xsetenv("LOGNAME", pw->pw_name, 1); + + if (pw->pw_uid) + xsetenv("PATH", _PATH_DEFPATH, 1); + else + xsetenv("PATH", _PATH_DEFPATH_ROOT, 1); +} + +static uid_t get_user(const char *s, const char *err) +{ + struct passwd *pw; + long tmp; + pw = getpwnam(s); + if (pw) + return pw->pw_uid; + tmp = strtol_or_err(s, err); + return tmp; +} + +static gid_t get_group(const char *s, const char *err) +{ + struct group *gr; + long tmp; + gr = getgrnam(s); + if (gr) + return gr->gr_gid; + tmp = strtol_or_err(s, err); + return tmp; +} + +static struct passwd *get_passwd(const char *s, uid_t *uid, const char *err) +{ + struct passwd *pw; + long tmp; + pw = getpwnam(s); + if (pw) { + *uid = pw->pw_uid; + } else { + tmp = strtol_or_err(s, err); + *uid = tmp; + pw = getpwuid(*uid); + } + return pw; +} + +static struct passwd *passwd_copy(struct passwd *dst, const struct passwd *src) +{ + struct passwd *rv; + rv = memcpy(dst, src, sizeof(*dst)); + rv->pw_name = xstrdup(rv->pw_name); + rv->pw_passwd = xstrdup(rv->pw_passwd); + rv->pw_gecos = xstrdup(rv->pw_gecos); + rv->pw_dir = xstrdup(rv->pw_dir); + rv->pw_shell = xstrdup(rv->pw_shell); + return rv; +} + +int main(int argc, char **argv) +{ + enum { + NNP = CHAR_MAX + 1, + RUID, + EUID, + RGID, + EGID, + REUID, + REGID, + CLEAR_GROUPS, + KEEP_GROUPS, + INIT_GROUPS, + GROUPS, + INHCAPS, + AMBCAPS, + LISTCAPS, + CAPBSET, + SECUREBITS, + PDEATHSIG, + SELINUX_LABEL, + APPARMOR_PROFILE, + RESET_ENV + }; + + static const struct option longopts[] = { + { "dump", no_argument, NULL, 'd' }, + { "nnp", no_argument, NULL, NNP }, + { "no-new-privs", no_argument, NULL, NNP }, + { "inh-caps", required_argument, NULL, INHCAPS }, + { "ambient-caps", required_argument, NULL, AMBCAPS }, + { "list-caps", no_argument, NULL, LISTCAPS }, + { "ruid", required_argument, NULL, RUID }, + { "euid", required_argument, NULL, EUID }, + { "rgid", required_argument, NULL, RGID }, + { "egid", required_argument, NULL, EGID }, + { "reuid", required_argument, NULL, REUID }, + { "regid", required_argument, NULL, REGID }, + { "clear-groups", no_argument, NULL, CLEAR_GROUPS }, + { "keep-groups", no_argument, NULL, KEEP_GROUPS }, + { "init-groups", no_argument, NULL, INIT_GROUPS }, + { "groups", required_argument, NULL, GROUPS }, + { "bounding-set", required_argument, NULL, CAPBSET }, + { "securebits", required_argument, NULL, SECUREBITS }, + { "pdeathsig", required_argument, NULL, PDEATHSIG, }, + { "selinux-label", required_argument, NULL, SELINUX_LABEL }, + { "apparmor-profile", required_argument, NULL, APPARMOR_PROFILE }, + { "help", no_argument, NULL, 'h' }, + { "reset-env", no_argument, NULL, RESET_ENV, }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { + /* keep in same order with enum definitions */ + {CLEAR_GROUPS, KEEP_GROUPS, INIT_GROUPS, GROUPS}, + {0} + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + int c; + struct privctx opts; + struct passwd *pw = NULL; + int dumplevel = 0; + int total_opts = 0; + int list_caps = 0; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + memset(&opts, 0, sizeof(opts)); + + while ((c = getopt_long(argc, argv, "+dhV", longopts, NULL)) != -1) { + err_exclusive_options(c, longopts, excl, excl_st); + total_opts++; + switch (c) { + case 'd': + dumplevel++; + break; + case NNP: + if (opts.nnp) + errx(EXIT_FAILURE, + _("duplicate --no-new-privs option")); + opts.nnp = 1; + break; + case RUID: + if (opts.have_ruid) + errx(EXIT_FAILURE, _("duplicate ruid")); + opts.have_ruid = 1; + pw = get_passwd(optarg, &opts.ruid, _("failed to parse ruid")); + if (pw) { + passwd_copy(&opts.passwd, pw); + opts.have_passwd = 1; + } + break; + case EUID: + if (opts.have_euid) + errx(EXIT_FAILURE, _("duplicate euid")); + opts.have_euid = 1; + opts.euid = get_user(optarg, _("failed to parse euid")); + break; + case REUID: + if (opts.have_ruid || opts.have_euid) + errx(EXIT_FAILURE, _("duplicate ruid or euid")); + opts.have_ruid = opts.have_euid = 1; + pw = get_passwd(optarg, &opts.ruid, _("failed to parse reuid")); + opts.euid = opts.ruid; + if (pw) { + passwd_copy(&opts.passwd, pw); + opts.have_passwd = 1; + } + break; + case RGID: + if (opts.have_rgid) + errx(EXIT_FAILURE, _("duplicate rgid")); + opts.have_rgid = 1; + opts.rgid = get_group(optarg, _("failed to parse rgid")); + break; + case EGID: + if (opts.have_egid) + errx(EXIT_FAILURE, _("duplicate egid")); + opts.have_egid = 1; + opts.egid = get_group(optarg, _("failed to parse egid")); + break; + case REGID: + if (opts.have_rgid || opts.have_egid) + errx(EXIT_FAILURE, _("duplicate rgid or egid")); + opts.have_rgid = opts.have_egid = 1; + opts.rgid = opts.egid = get_group(optarg, _("failed to parse regid")); + break; + case CLEAR_GROUPS: + if (opts.clear_groups) + errx(EXIT_FAILURE, + _("duplicate --clear-groups option")); + opts.clear_groups = 1; + break; + case KEEP_GROUPS: + if (opts.keep_groups) + errx(EXIT_FAILURE, + _("duplicate --keep-groups option")); + opts.keep_groups = 1; + break; + case INIT_GROUPS: + if (opts.init_groups) + errx(EXIT_FAILURE, + _("duplicate --init-groups option")); + opts.init_groups = 1; + break; + case GROUPS: + if (opts.have_groups) + errx(EXIT_FAILURE, + _("duplicate --groups option")); + parse_groups(&opts, optarg); + break; + case PDEATHSIG: + if (opts.pdeathsig) + errx(EXIT_FAILURE, + _("duplicate --keep-pdeathsig option")); + parse_pdeathsig(&opts, optarg); + break; + case LISTCAPS: + list_caps = 1; + break; + case INHCAPS: + if (opts.caps_to_inherit) + errx(EXIT_FAILURE, + _("duplicate --inh-caps option")); + opts.caps_to_inherit = optarg; + break; + case AMBCAPS: + if (opts.ambient_caps) + errx(EXIT_FAILURE, + _("duplicate --ambient-caps option")); + opts.ambient_caps = optarg; + break; + case CAPBSET: + if (opts.bounding_set) + errx(EXIT_FAILURE, + _("duplicate --bounding-set option")); + opts.bounding_set = optarg; + break; + case SECUREBITS: + if (opts.have_securebits) + errx(EXIT_FAILURE, + _("duplicate --securebits option")); + parse_securebits(&opts, optarg); + break; + case SELINUX_LABEL: + if (opts.selinux_label) + errx(EXIT_FAILURE, + _("duplicate --selinux-label option")); + opts.selinux_label = optarg; + break; + case APPARMOR_PROFILE: + if (opts.apparmor_profile) + errx(EXIT_FAILURE, + _("duplicate --apparmor-profile option")); + opts.apparmor_profile = optarg; + break; + case RESET_ENV: + opts.reset_env = 1; + break; + case 'h': + usage(); + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (dumplevel) { + if (total_opts != dumplevel || optind < argc) + errx(EXIT_FAILURE, + _("--dump is incompatible with all other options")); + dump(dumplevel); + return EXIT_SUCCESS; + } + + if (list_caps) { + if (total_opts != 1 || optind < argc) + errx(EXIT_FAILURE, + _("--list-caps must be specified alone")); + list_known_caps(); + return EXIT_SUCCESS; + } + + if (argc <= optind) + errx(EXIT_FAILURE, _("No program specified")); + + if ((opts.have_rgid || opts.have_egid) + && !opts.keep_groups && !opts.clear_groups && !opts.init_groups + && !opts.have_groups) + errx(EXIT_FAILURE, + _("--[re]gid requires --keep-groups, --clear-groups, --init-groups, or --groups")); + + if (opts.init_groups && !opts.have_ruid) + errx(EXIT_FAILURE, + _("--init-groups requires --ruid or --reuid")); + + if (opts.init_groups && !opts.have_passwd) + errx(EXIT_FAILURE, + _("uid %ld not found, --init-groups requires an user that " + "can be found on the system"), + (long) opts.ruid); + + if (opts.reset_env) { + if (opts.have_passwd) + /* pwd according to --ruid or --reuid */ + pw = &opts.passwd; + else + /* pwd for the current user */ + pw = getpwuid(getuid()); + do_reset_environ(pw); + } + + if (opts.nnp && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) + err(EXIT_FAILURE, _("disallow granting new privileges failed")); + + if (opts.selinux_label) + do_selinux_label(opts.selinux_label); + if (opts.apparmor_profile) + do_apparmor_profile(opts.apparmor_profile); + + if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) == -1) + err(EXIT_FAILURE, _("keep process capabilities failed")); + + /* We're going to want CAP_SETPCAP, CAP_SETUID, and CAP_SETGID if + * possible. */ + bump_cap(CAP_SETPCAP); + bump_cap(CAP_SETUID); + bump_cap(CAP_SETGID); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(SETPRIV_EXIT_PRIVERR, _("activate capabilities")); + + if (opts.have_ruid || opts.have_euid) { + do_setresuid(&opts); + /* KEEPCAPS doesn't work for the effective mask. */ + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(SETPRIV_EXIT_PRIVERR, _("reactivate capabilities")); + } + + if (opts.have_rgid || opts.have_egid) + do_setresgid(&opts); + + if (opts.have_groups) { + if (setgroups(opts.num_groups, opts.groups) != 0) + err(SETPRIV_EXIT_PRIVERR, _("setgroups failed")); + } else if (opts.init_groups) { + if (initgroups(opts.passwd.pw_name, opts.passwd.pw_gid) != 0) + err(SETPRIV_EXIT_PRIVERR, _("initgroups failed")); + } else if (opts.clear_groups) { + gid_t x = 0; + if (setgroups(0, &x) != 0) + err(SETPRIV_EXIT_PRIVERR, _("setgroups failed")); + } + + if (opts.have_securebits && prctl(PR_SET_SECUREBITS, opts.securebits, 0, 0, 0) != 0) + err(SETPRIV_EXIT_PRIVERR, _("set process securebits failed")); + + if (opts.bounding_set) { + do_caps(CAP_TYPE_BOUNDING, opts.bounding_set); + errno = EPERM; /* capng doesn't set errno if we're missing CAP_SETPCAP */ + if (capng_apply(CAPNG_SELECT_BOUNDS) != 0) + err(SETPRIV_EXIT_PRIVERR, _("apply bounding set")); + } + + if (opts.caps_to_inherit) { + do_caps(CAP_TYPE_INHERITABLE, opts.caps_to_inherit); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(SETPRIV_EXIT_PRIVERR, _("apply capabilities")); + } + + if (opts.ambient_caps) { + do_caps(CAP_TYPE_AMBIENT, opts.ambient_caps); + } + + /* Clear or set parent death signal */ + if (opts.pdeathsig && prctl(PR_SET_PDEATHSIG, opts.pdeathsig < 0 ? 0 : opts.pdeathsig) != 0) + err(SETPRIV_EXIT_PRIVERR, _("set parent death signal failed")); + + execvp(argv[optind], argv + optind); + errexec(argv[optind]); +} diff --git a/sys-utils/setsid.1 b/sys-utils/setsid.1 new file mode 100644 index 0000000..64f0555 --- /dev/null +++ b/sys-utils/setsid.1 @@ -0,0 +1,42 @@ +.\" Rick Sladkey <jrs@world.std.com> +.\" In the public domain. +.TH SETSID 1 "July 2014" "util-linux" "User Commands" +.SH NAME +setsid \- run a program in a new session +.SH SYNOPSIS +.B setsid +[options] +.I program +.RI [ arguments ] +.SH DESCRIPTION +.B setsid +runs a program in a new session. The command calls +.BR fork (2) +if already a process group leader. Otherwise, it executes a program in the +current process. This default behavior is possible to override by +the \fB\-\-fork\fR option. +.SH OPTIONS +.TP +.BR \-c , " \-\-ctty" +Set the controlling terminal to the current one. +.TP +.BR \-f , " \-\-fork" +Always create a new process. +.TP +.BR \-w , " \-\-wait" +Wait for the execution of the program to end, and return the exit value of +this program as the return value of +.BR setsid . +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH "SEE ALSO" +.BR setsid (2) +.SH AUTHOR +Rick Sladkey <jrs@world.std.com> +.SH AVAILABILITY +The setsid command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/setsid.c b/sys-utils/setsid.c new file mode 100644 index 0000000..8b4f83d --- /dev/null +++ b/sys-utils/setsid.c @@ -0,0 +1,123 @@ +/* + * setsid.c -- execute a command in a new session + * Rick Sladkey <jrs@world.std.com> + * In the public domain. + * + * 1999-02-22 Arkadiusz MiÅ›kiewicz <misiek@pld.ORG.PL> + * - added Native Language Support + * + * 2001-01-18 John Fremlin <vii@penguinpowered.com> + * - fork in case we are process group leader + * + * 2008-08-20 Daniel Kahn Gillmor <dkg@fifthhorseman.net> + * - if forked, wait on child process and emit its return code. + */ + +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include "c.h" +#include "nls.h" +#include "closestream.h" + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _( + " %s [options] <program> [arguments ...]\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Run a program in a new session.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -c, --ctty set the controlling terminal to the current one\n"), out); + fputs(_(" -f, --fork always fork\n"), out); + fputs(_(" -w, --wait wait program to exit, and use the same return\n"), out); + + printf(USAGE_HELP_OPTIONS(16)); + + printf(USAGE_MAN_TAIL("setsid(1)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + int ch, forcefork = 0; + int ctty = 0; + pid_t pid; + int status = 0; + + static const struct option longopts[] = { + {"ctty", no_argument, NULL, 'c'}, + {"fork", no_argument, NULL, 'f'}, + {"wait", no_argument, NULL, 'w'}, + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((ch = getopt_long(argc, argv, "+Vhcfw", longopts, NULL)) != -1) + switch (ch) { + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'c': + ctty=1; + break; + case 'f': + forcefork = 1; + break; + case 'w': + status = 1; + break; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + + if (argc - optind < 1) { + warnx(_("no command specified")); + errtryhelp(EXIT_FAILURE); + } + + if (forcefork || getpgrp() == getpid()) { + pid = fork(); + switch (pid) { + case -1: + err(EXIT_FAILURE, _("fork")); + case 0: + /* child */ + break; + default: + /* parent */ + if (!status) + return EXIT_SUCCESS; + if (wait(&status) != pid) + err(EXIT_FAILURE, "wait"); + if (WIFEXITED(status)) + return WEXITSTATUS(status); + err(status, _("child %d did not exit normally"), pid); + } + } + if (setsid() < 0) + /* cannot happen */ + err(EXIT_FAILURE, _("setsid failed")); + + if (ctty && ioctl(STDIN_FILENO, TIOCSCTTY, 1)) + err(EXIT_FAILURE, _("failed to set the controlling terminal")); + execvp(argv[optind], argv + optind); + errexec(argv[optind]); +} diff --git a/sys-utils/swapoff.8 b/sys-utils/swapoff.8 new file mode 100644 index 0000000..1a06b7e --- /dev/null +++ b/sys-utils/swapoff.8 @@ -0,0 +1 @@ +.so man8/swapon.8 diff --git a/sys-utils/swapoff.c b/sys-utils/swapoff.c new file mode 100644 index 0000000..0a3807f --- /dev/null +++ b/sys-utils/swapoff.c @@ -0,0 +1,253 @@ +#include <stdio.h> +#include <errno.h> +#include <getopt.h> + +#ifdef HAVE_SYS_SWAP_H +# include <sys/swap.h> +#endif + +#include "nls.h" +#include "c.h" +#include "xalloc.h" +#include "closestream.h" + +#include "swapprober.h" +#include "swapon-common.h" + +#if !defined(HAVE_SWAPOFF) && defined(SYS_swapoff) +# include <sys/syscall.h> +# define swapoff(path) syscall(SYS_swapoff, path) +#endif + +static int verbose; +static int all; + +#define QUIET 1 +#define CANONIC 1 + +/* + * This function works like mnt_resolve_tag(), but it's able to read UUID/LABEL + * from regular swap files too (according to entries in /proc/swaps). Note that + * mnt_resolve_tag() and mnt_resolve_spec() works with system visible block + * devices only. + */ +static char *swapoff_resolve_tag(const char *name, const char *value, + struct libmnt_cache *cache) +{ + char *path; + struct libmnt_table *tb; + struct libmnt_iter *itr; + struct libmnt_fs *fs; + + /* this is usual case for block devices (and it's really fast as it uses + * udev /dev/disk/by-* symlinks by default */ + path = mnt_resolve_tag(name, value, cache); + if (path) + return path; + + /* try regular files from /proc/swaps */ + tb = get_swaps(); + if (!tb) + return NULL; + + itr = mnt_new_iter(MNT_ITER_BACKWARD); + if (!itr) + err(EXIT_FAILURE, _("failed to initialize libmount iterator")); + + while (tb && mnt_table_next_fs(tb, itr, &fs) == 0) { + blkid_probe pr = NULL; + const char *src = mnt_fs_get_source(fs); + const char *type = mnt_fs_get_swaptype(fs); + const char *data = NULL; + + if (!src || !type || strcmp(type, "file") != 0) + continue; + pr = get_swap_prober(src); + if (!pr) + continue; + blkid_probe_lookup_value(pr, name, &data, NULL); + if (data && strcmp(data, value) == 0) + path = xstrdup(src); + blkid_free_probe(pr); + if (path) + break; + } + + mnt_free_iter(itr); + return path; +} + +static int do_swapoff(const char *orig_special, int quiet, int canonic) +{ + const char *special = orig_special; + + if (verbose) + printf(_("swapoff %s\n"), orig_special); + + if (!canonic) { + char *n, *v; + + special = mnt_resolve_spec(orig_special, mntcache); + if (!special && blkid_parse_tag_string(orig_special, &n, &v) == 0) { + special = swapoff_resolve_tag(n, v, mntcache); + free(n); + free(v); + } + if (!special) + return cannot_find(orig_special); + } + + if (swapoff(special) == 0) + return 0; /* success */ + + if (errno == EPERM) + errx(EXIT_FAILURE, _("Not superuser.")); + + if (!quiet || errno == ENOMEM) + warn(_("%s: swapoff failed"), orig_special); + + return -1; +} + +static int swapoff_by(const char *name, const char *value, int quiet) +{ + const char *special = swapoff_resolve_tag(name, value, mntcache); + return special ? do_swapoff(special, quiet, CANONIC) : cannot_find(value); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] [<spec>]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Disable devices and files for paging and swapping.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --all disable all swaps from /proc/swaps\n" + " -v, --verbose verbose mode\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(24)); + + fputs(_("\nThe <spec> parameter:\n" \ + " -L <label> LABEL of device to be used\n" \ + " -U <uuid> UUID of device to be used\n" \ + " LABEL=<label> LABEL of device to be used\n" \ + " UUID=<uuid> UUID of device to be used\n" \ + " <device> name of device to be used\n" \ + " <file> name of file to be used\n"), out); + + printf(USAGE_MAN_TAIL("swapoff(8)")); + exit(EXIT_SUCCESS); +} + +static int swapoff_all(void) +{ + int status = 0; + struct libmnt_table *tb; + struct libmnt_fs *fs; + struct libmnt_iter *itr = mnt_new_iter(MNT_ITER_BACKWARD); + + if (!itr) + err(EXIT_FAILURE, _("failed to initialize libmount iterator")); + + /* + * In case /proc/swaps exists, unswap stuff listed there. We are quiet + * but report errors in status. Errors might mean that /proc/swaps + * exists as ordinary file, not in procfs. do_swapoff() exits + * immediately on EPERM. + */ + tb = get_swaps(); + + while (tb && mnt_table_find_next_fs(tb, itr, match_swap, NULL, &fs) == 0) + status |= do_swapoff(mnt_fs_get_source(fs), QUIET, CANONIC); + + /* + * Unswap stuff mentioned in /etc/fstab. Probably it was unmounted + * already, so errors are not bad. Doing swapoff -a twice should not + * give error messages. + */ + tb = get_fstab(); + mnt_reset_iter(itr, MNT_ITER_FORWARD); + + while (tb && mnt_table_find_next_fs(tb, itr, match_swap, NULL, &fs) == 0) { + if (!is_active_swap(mnt_fs_get_source(fs))) + do_swapoff(mnt_fs_get_source(fs), QUIET, !CANONIC); + } + + mnt_free_iter(itr); + return status; +} + +int main(int argc, char *argv[]) +{ + int status = 0, c; + size_t i; + + static const struct option long_opts[] = { + { "all", no_argument, NULL, 'a' }, + { "help", no_argument, NULL, 'h' }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "ahvVL:U:", + long_opts, NULL)) != -1) { + switch (c) { + case 'a': /* all */ + ++all; + break; + case 'h': /* help */ + usage(); + break; + case 'v': /* be chatty */ + ++verbose; + break; + case 'V': /* version */ + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'L': + add_label(optarg); + break; + case 'U': + add_uuid(optarg); + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + argv += optind; + + if (!all && !numof_labels() && !numof_uuids() && *argv == NULL) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + mnt_init_debug(0); + mntcache = mnt_new_cache(); + + for (i = 0; i < numof_labels(); i++) + status |= swapoff_by("LABEL", get_label(i), !QUIET); + + for (i = 0; i < numof_uuids(); i++) + status |= swapoff_by("UUID", get_uuid(i), !QUIET); + + while (*argv != NULL) + status |= do_swapoff(*argv++, !QUIET, !CANONIC); + + if (all) + status |= swapoff_all(); + + free_tables(); + mnt_unref_cache(mntcache); + + return status; +} diff --git a/sys-utils/swapon-common.c b/sys-utils/swapon-common.c new file mode 100644 index 0000000..dd1593d --- /dev/null +++ b/sys-utils/swapon-common.c @@ -0,0 +1,117 @@ + +#include "c.h" +#include "nls.h" +#include "xalloc.h" + +#include "swapon-common.h" + +/* + * content of /proc/swaps and /etc/fstab + */ +static struct libmnt_table *swaps, *fstab; + +struct libmnt_cache *mntcache; + +static int table_parser_errcb(struct libmnt_table *tb __attribute__((__unused__)), + const char *filename, int line) +{ + if (filename) + warnx(_("%s: parse error at line %d -- ignored"), filename, line); + return 1; +} + +struct libmnt_table *get_fstab(void) +{ + if (!fstab) { + fstab = mnt_new_table(); + if (!fstab) + return NULL; + mnt_table_set_parser_errcb(fstab, table_parser_errcb); + mnt_table_set_cache(fstab, mntcache); + if (mnt_table_parse_fstab(fstab, NULL) != 0) + return NULL; + } + + return fstab; +} + +struct libmnt_table *get_swaps(void) +{ + if (!swaps) { + swaps = mnt_new_table(); + if (!swaps) + return NULL; + mnt_table_set_cache(swaps, mntcache); + mnt_table_set_parser_errcb(swaps, table_parser_errcb); + if (mnt_table_parse_swaps(swaps, NULL) != 0) + return NULL; + } + + return swaps; +} + +void free_tables(void) +{ + mnt_unref_table(swaps); + mnt_unref_table(fstab); +} + +int match_swap(struct libmnt_fs *fs, void *data __attribute__((unused))) +{ + return fs && mnt_fs_is_swaparea(fs); +} + +int is_active_swap(const char *filename) +{ + struct libmnt_table *st = get_swaps(); + return st && mnt_table_find_source(st, filename, MNT_ITER_BACKWARD); +} + + +int cannot_find(const char *special) +{ + warnx(_("cannot find the device for %s"), special); + return -1; +} + +/* + * Lists with -L and -U option + */ +static const char **llist; +static size_t llct; +static const char **ulist; +static size_t ulct; + + +void add_label(const char *label) +{ + llist = xrealloc(llist, (++llct) * sizeof(char *)); + llist[llct - 1] = label; +} + +const char *get_label(size_t i) +{ + return i < llct ? llist[i] : NULL; +} + +size_t numof_labels(void) +{ + return llct; +} + +void add_uuid(const char *uuid) +{ + ulist = xrealloc(ulist, (++ulct) * sizeof(char *)); + ulist[ulct - 1] = uuid; +} + +const char *get_uuid(size_t i) +{ + return i < ulct ? ulist[i] : NULL; +} + +size_t numof_uuids(void) +{ + return ulct; +} + diff --git a/sys-utils/swapon-common.h b/sys-utils/swapon-common.h new file mode 100644 index 0000000..d1b679f --- /dev/null +++ b/sys-utils/swapon-common.h @@ -0,0 +1,25 @@ +#ifndef UTIL_LINUX_SWAPON_COMMON_H +#define UTIL_LINUX_SWAPON_COMMON_H + +#include <libmount.h> + +extern struct libmnt_cache *mntcache; + +extern struct libmnt_table *get_fstab(void); +extern struct libmnt_table *get_swaps(void); +extern void free_tables(void); + +extern int match_swap(struct libmnt_fs *fs, void *data); +extern int is_active_swap(const char *filename); + +extern int cannot_find(const char *special); + +extern void add_label(const char *label); +extern const char *get_label(size_t i); +extern size_t numof_labels(void); + +extern void add_uuid(const char *uuid); +extern const char *get_uuid(size_t i); +extern size_t numof_uuids(void); + +#endif /* UTIL_LINUX_SWAPON_COMMON_H */ diff --git a/sys-utils/swapon.8 b/sys-utils/swapon.8 new file mode 100644 index 0000000..510a15f --- /dev/null +++ b/sys-utils/swapon.8 @@ -0,0 +1,256 @@ +.\" Copyright (c) 1980, 1991 Regents of the University of California. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)swapon.8 6.3 (Berkeley) 3/16/91 +.\" +.TH SWAPON 8 "October 2014" "util-linux" "System Administration" +.SH NAME +swapon, swapoff \- enable/disable devices and files for paging and swapping +.SH SYNOPSIS +.B swapon +[options] +.RI [ specialfile ...] +.br +.B swapoff +.RB [ \-va ] +.RI [ specialfile ...] +.SH DESCRIPTION +.B swapon +is used to specify devices on which paging and swapping are to take place. + +The device or file used is given by the +.I specialfile +parameter. It may be of the form +.BI \-L " label" +or +.BI \-U " uuid" +to indicate a device by label or uuid. + +Calls to +.B swapon +normally occur in the system boot scripts making all swap devices available, so +that the paging and swapping activity is interleaved across several devices and +files. + +.B swapoff +disables swapping on the specified devices and files. +When the +.B \-a +flag is given, swapping is disabled on all known swap devices and files +(as found in +.I /proc/swaps +or +.IR /etc/fstab ). + +.SH OPTIONS +.TP +.BR \-a , " \-\-all" +All devices marked as ``swap'' in +.I /etc/fstab +are made available, except for those with the ``noauto'' option. +Devices that are already being used as swap are silently skipped. +.TP +.BR \-d , " \-\-discard" [ =\fIpolicy\fR] +Enable swap discards, if the swap backing device supports the discard or +trim operation. This may improve performance on some Solid State Devices, +but often it does not. The option allows one to select between two +available swap discard policies: +.B \-\-discard=once +to perform a single-time discard operation for the whole swap area at swapon; +or +.B \-\-discard=pages +to asynchronously discard freed swap pages before they are available for reuse. +If no policy is selected, the default behavior is to enable both discard types. +The +.I /etc/fstab +mount options +.BR discard , +.BR discard=once , +or +.B discard=pages +may also be used to enable discard flags. +.TP +.BR \-e , " \-\-ifexists" +Silently skip devices that do not exist. +The +.I /etc/fstab +mount option +.B nofail +may also be used to skip non-existing device. + +.TP +.BR \-f , " \-\-fixpgsz" +Reinitialize (exec mkswap) the swap space if its page size does not +match that of the current running kernel. +.BR mkswap (2) +initializes the whole device and does not check for bad blocks. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.TP +.BI \-L " label" +Use the partition that has the specified +.IR label . +(For this, access to +.I /proc/partitions +is needed.) +.TP +.BR \-o , " \-\-options " \fIopts\fP +Specify swap options by an fstab-compatible comma-separated string. +For example: +.RS +.RS +.sp +.B "swapon -o pri=1,discard=pages,nofail /dev/sda2" +.sp +.RE +The \fIopts\fP string is evaluated last and overrides all other +command line options. +.RE +.TP +.BR \-p , " \-\-priority " \fIpriority\fP +Specify the priority of the swap device. +.I priority +is a value between \-1 and 32767. Higher numbers indicate +higher priority. See +.BR swapon (2) +for a full description of swap priorities. Add +.BI pri= value +to the option field of +.I /etc/fstab +for use with +.BR "swapon -a" . +When no priority is defined, it defaults to \-1. +.TP +.BR \-s , " \-\-summary" +Display swap usage summary by device. Equivalent to "cat /proc/swaps". +This output format is DEPRECATED in favour +of \fB\-\-show\fR that provides better control on output data. +.TP +.BR \-\-show [ =\fIcolumn\fR ...] +Display a definable table of swap areas. See the +.B \-\-help +output for a list of available columns. +.TP +.B \-\-output\-all +Output all available columns. +.TP +.B \-\-noheadings +Do not print headings when displaying +.B \-\-show +output. +.TP +.B \-\-raw +Display +.B \-\-show +output without aligning table columns. +.TP +.B \-\-bytes +Display swap size in bytes in +.B \-\-show +output instead of in user-friendly units. +.TP +.BI \-U " uuid" +Use the partition that has the specified +.IR uuid . +.TP +.BR \-v , " \-\-verbose" +Be verbose. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.SH NOTES +You should not use \fBswapon\fR on a file with holes. +This can be seen in the system log as +.RS +.sp +.B "swapon: swapfile has holes." +.sp +.RE +The swap file implementation in the kernel expects to be able to write to the +file directly, without the assistance of the filesystem. This is a problem on +preallocated files (e.g. +.BR fallocate (1)) +on filesystems like \fBXFS\fR or \fBext4\fR, and on copy-on-write +filesystems like \fBbtrfs\fR. +.PP +It is recommended to use +.BR dd (1) +and +.I /dev/zero +to avoid holes on XFS and ext4. +.PP +.B swapon +may not work correctly when using a swap file with some versions of +\fBbtrfs\fR. This is due to btrfs being a copy-on-write filesystem: the +file location may not be static and corruption can result. Btrfs actively +disallows the use of swap files on its filesystems by refusing to map the file. +.PP +One possible workaround is to map the swap +file to a loopback device. This will allow the filesystem to determine the +mapping properly but may come with a performance impact. +.PP +Swap over \fBNFS\fR may not work. +.PP +.B swapon +automatically detects and rewrites a swap space signature with old software +suspend data (e.g. S1SUSPEND, S2SUSPEND, ...). The problem is that if we don't +do it, then we get data corruption the next time an attempt at unsuspending is +made. + +.SH ENVIRONMENT +.IP LIBMOUNT_DEBUG=all +enables libmount debug output. +.IP LIBBLKID_DEBUG=all +enables libblkid debug output. + +.SH SEE ALSO +.BR swapoff (2), +.BR swapon (2), +.BR fstab (5), +.BR init (8), +.BR mkswap (8), +.BR mount (8), +.BR rc (8) +.SH FILES +.br +.I /dev/sd?? +standard paging devices +.br +.I /etc/fstab +ascii filesystem description table +.SH HISTORY +The +.B swapon +command appeared in 4.0BSD. +.SH AVAILABILITY +The swapon command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/swapon.c b/sys-utils/swapon.c new file mode 100644 index 0000000..357dcb3 --- /dev/null +++ b/sys-utils/swapon.c @@ -0,0 +1,1017 @@ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <getopt.h> +#include <string.h> +#include <errno.h> +#include <sys/stat.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <fcntl.h> +#include <stdint.h> +#include <ctype.h> + +#include <libsmartcols.h> + +#include "c.h" +#include "nls.h" +#include "bitops.h" +#include "blkdev.h" +#include "pathnames.h" +#include "xalloc.h" +#include "strutils.h" +#include "optutils.h" +#include "closestream.h" + +#include "swapheader.h" +#include "swapprober.h" +#include "swapon-common.h" + +#ifdef HAVE_SYS_SWAP_H +# include <sys/swap.h> +#endif + +#ifndef SWAP_FLAG_DISCARD +# define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ +#endif + +#ifndef SWAP_FLAG_DISCARD_ONCE +# define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ +#endif + +#ifndef SWAP_FLAG_DISCARD_PAGES +# define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ +#endif + +#define SWAP_FLAGS_DISCARD_VALID (SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ + SWAP_FLAG_DISCARD_PAGES) + +#ifndef SWAP_FLAG_PREFER +# define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ +#endif + +#ifndef SWAP_FLAG_PRIO_MASK +# define SWAP_FLAG_PRIO_MASK 0x7fff +#endif + +#ifndef SWAP_FLAG_PRIO_SHIFT +# define SWAP_FLAG_PRIO_SHIFT 0 +#endif + +#if !defined(HAVE_SWAPON) && defined(SYS_swapon) +# include <sys/syscall.h> +# define swapon(path, flags) syscall(SYS_swapon, path, flags) +#endif + +#define MAX_PAGESIZE (64 * 1024) + +#ifndef UUID_STR_LEN +# define UUID_STR_LEN 37 +#endif + +enum { + SIG_SWAPSPACE = 1, + SIG_SWSUSPEND +}; + +/* column names */ +struct colinfo { + const char *name; /* header */ + double whint; /* width hint (N < 1 is in percent of termwidth) */ + int flags; /* SCOLS_FL_* */ + const char *help; +}; + +enum { + COL_PATH, + COL_TYPE, + COL_SIZE, + COL_USED, + COL_PRIO, + COL_UUID, + COL_LABEL +}; +static struct colinfo infos[] = { + [COL_PATH] = { "NAME", 0.20, 0, N_("device file or partition path") }, + [COL_TYPE] = { "TYPE", 0.20, SCOLS_FL_TRUNC, N_("type of the device")}, + [COL_SIZE] = { "SIZE", 0.20, SCOLS_FL_RIGHT, N_("size of the swap area")}, + [COL_USED] = { "USED", 0.20, SCOLS_FL_RIGHT, N_("bytes in use")}, + [COL_PRIO] = { "PRIO", 0.20, SCOLS_FL_RIGHT, N_("swap priority")}, + [COL_UUID] = { "UUID", 0.20, 0, N_("swap uuid")}, + [COL_LABEL] = { "LABEL", 0.20, 0, N_("swap label")}, +}; + + +/* swap area properties */ +struct swap_prop { + int discard; /* discard policy */ + int priority; /* non-prioritized swap by default */ + int no_fail; /* skip device if not exist */ +}; + +/* device description */ +struct swap_device { + const char *path; /* device or file to be turned on */ + const char *label; /* swap label */ + const char *uuid; /* unique identifier */ + unsigned int pagesize; +}; + +/* control struct */ +struct swapon_ctl { + int columns[ARRAY_SIZE(infos) * 2]; /* --show columns */ + int ncolumns; /* number of columns */ + + struct swap_prop props; /* global settings for all devices */ + + unsigned int + all:1, /* turn on all swap devices */ + bytes:1, /* display --show in bytes */ + fix_page_size:1, /* reinitialize page size */ + no_heading:1, /* toggle --show headers */ + raw:1, /* toggle --show alignment */ + show:1, /* display --show information */ + verbose:1; /* be chatty */ +}; + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + assert(name); + + for (i = 0; i < ARRAY_SIZE(infos); i++) { + const char *cn = infos[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static inline int get_column_id(const struct swapon_ctl *ctl, int num) +{ + assert(num < ctl->ncolumns); + assert(ctl->columns[num] < (int) ARRAY_SIZE(infos)); + + return ctl->columns[num]; +} + +static inline struct colinfo *get_column_info(const struct swapon_ctl *ctl, unsigned num) +{ + return &infos[get_column_id(ctl, num)]; +} + +static void add_scols_line(const struct swapon_ctl *ctl, struct libscols_table *table, struct libmnt_fs *fs) +{ + int i; + struct libscols_line *line; + blkid_probe pr = NULL; + const char *data; + + assert(table); + assert(fs); + + line = scols_table_new_line(table, NULL); + if (!line) + err(EXIT_FAILURE, _("failed to allocate output line")); + + data = mnt_fs_get_source(fs); + if (access(data, R_OK) == 0) + pr = get_swap_prober(data); + for (i = 0; i < ctl->ncolumns; i++) { + char *str = NULL; + off_t size; + + switch (get_column_id(ctl, i)) { + case COL_PATH: + xasprintf(&str, "%s", mnt_fs_get_source(fs)); + break; + case COL_TYPE: + xasprintf(&str, "%s", mnt_fs_get_swaptype(fs)); + break; + case COL_SIZE: + size = mnt_fs_get_size(fs); + size *= 1024; /* convert to bytes */ + if (ctl->bytes) + xasprintf(&str, "%jd", size); + else + str = size_to_human_string(SIZE_SUFFIX_1LETTER, size); + break; + case COL_USED: + size = mnt_fs_get_usedsize(fs); + size *= 1024; /* convert to bytes */ + if (ctl->bytes) + xasprintf(&str, "%jd", size); + else + str = size_to_human_string(SIZE_SUFFIX_1LETTER, size); + break; + case COL_PRIO: + xasprintf(&str, "%d", mnt_fs_get_priority(fs)); + break; + case COL_UUID: + if (pr && !blkid_probe_lookup_value(pr, "UUID", &data, NULL)) + xasprintf(&str, "%s", data); + break; + case COL_LABEL: + if (pr && !blkid_probe_lookup_value(pr, "LABEL", &data, NULL)) + xasprintf(&str, "%s", data); + break; + default: + break; + } + + if (str && scols_line_refer_data(line, i, str)) + err(EXIT_FAILURE, _("failed to add output data")); + } + if (pr) + blkid_free_probe(pr); + return; +} + +static int display_summary(void) +{ + struct libmnt_table *st = get_swaps(); + struct libmnt_iter *itr; + struct libmnt_fs *fs; + + if (!st) + return -1; + + if (mnt_table_is_empty(st)) + return 0; + + itr = mnt_new_iter(MNT_ITER_FORWARD); + if (!itr) + err(EXIT_FAILURE, _("failed to initialize libmount iterator")); + + printf(_("%s\t\t\t\tType\t\tSize\tUsed\tPriority\n"), _("Filename")); + + while (mnt_table_next_fs(st, itr, &fs) == 0) { + printf("%-39s\t%-8s\t%jd\t%jd\t%d\n", + mnt_fs_get_source(fs), + mnt_fs_get_swaptype(fs), + mnt_fs_get_size(fs), + mnt_fs_get_usedsize(fs), + mnt_fs_get_priority(fs)); + } + + mnt_free_iter(itr); + return 0; +} + +static int show_table(struct swapon_ctl *ctl) +{ + struct libmnt_table *st = get_swaps(); + struct libmnt_iter *itr = NULL; + struct libmnt_fs *fs; + int i; + struct libscols_table *table = NULL; + + if (!st) + return -1; + + itr = mnt_new_iter(MNT_ITER_FORWARD); + if (!itr) + err(EXIT_FAILURE, _("failed to initialize libmount iterator")); + + scols_init_debug(0); + + table = scols_new_table(); + if (!table) + err(EXIT_FAILURE, _("failed to allocate output table")); + + scols_table_enable_raw(table, ctl->raw); + scols_table_enable_noheadings(table, ctl->no_heading); + + for (i = 0; i < ctl->ncolumns; i++) { + struct colinfo *col = get_column_info(ctl, i); + + if (!scols_table_new_column(table, col->name, col->whint, col->flags)) + err(EXIT_FAILURE, _("failed to allocate output column")); + } + + while (mnt_table_next_fs(st, itr, &fs) == 0) + add_scols_line(ctl, table, fs); + + scols_print_table(table); + scols_unref_table(table); + mnt_free_iter(itr); + return 0; +} + +/* calls mkswap */ +static int swap_reinitialize(struct swap_device *dev) +{ + pid_t pid; + int status, ret; + char const *cmd[7]; + int idx=0; + + assert(dev); + assert(dev->path); + + warnx(_("%s: reinitializing the swap."), dev->path); + + switch ((pid=fork())) { + case -1: /* fork error */ + warn(_("fork failed")); + return -1; + + case 0: /* child */ + if (geteuid() != getuid()) { + /* in case someone uses swapon as setuid binary */ + if (setgid(getgid()) < 0) + exit(EXIT_FAILURE); + if (setuid(getuid()) < 0) + exit(EXIT_FAILURE); + } + + cmd[idx++] = "mkswap"; + if (dev->label) { + cmd[idx++] = "-L"; + cmd[idx++] = dev->label; + } + if (dev->uuid) { + cmd[idx++] = "-U"; + cmd[idx++] = dev->uuid; + } + cmd[idx++] = dev->path; + cmd[idx++] = NULL; + execvp(cmd[0], (char * const *) cmd); + errexec(cmd[0]); + + default: /* parent */ + do { + ret = waitpid(pid, &status, 0); + } while (ret == -1 && errno == EINTR); + + if (ret < 0) { + warn(_("waitpid failed")); + return -1; + } + + /* mkswap returns: 0=suss, >0 error */ + if (WIFEXITED(status) && WEXITSTATUS(status)==0) + return 0; /* ok */ + break; + } + return -1; /* error */ +} + +/* Replaces unwanted SWSUSPEND signature with swap signature */ +static int swap_rewrite_signature(const struct swap_device *dev) +{ + int fd, rc = -1; + + assert(dev); + assert(dev->path); + assert(dev->pagesize); + + fd = open(dev->path, O_WRONLY); + if (fd == -1) { + warn(_("cannot open %s"), dev->path); + return -1; + } + + if (lseek(fd, dev->pagesize - SWAP_SIGNATURE_SZ, SEEK_SET) < 0) { + warn(_("%s: lseek failed"), dev->path); + goto err; + } + + if (write(fd, (void *) SWAP_SIGNATURE, + SWAP_SIGNATURE_SZ) != SWAP_SIGNATURE_SZ) { + warn(_("%s: write signature failed"), dev->path); + goto err; + } + + rc = 0; +err: + if (close_fd(fd) != 0) { + warn(_("write failed: %s"), dev->path); + rc = -1; + } + return rc; +} + +static int swap_detect_signature(const char *buf, int *sig) +{ + assert(buf); + assert(sig); + + if (memcmp(buf, SWAP_SIGNATURE, SWAP_SIGNATURE_SZ) == 0) + *sig = SIG_SWAPSPACE; + + else if (memcmp(buf, "S1SUSPEND", 9) == 0 || + memcmp(buf, "S2SUSPEND", 9) == 0 || + memcmp(buf, "ULSUSPEND", 9) == 0 || + memcmp(buf, "\xed\xc3\x02\xe9\x98\x56\xe5\x0c", 8) == 0 || + memcmp(buf, "LINHIB0001", 10) == 0) + *sig = SIG_SWSUSPEND; + else + return 0; + + return 1; +} + +static char *swap_get_header(int fd, int *sig, unsigned int *pagesize) +{ + char *buf; + ssize_t datasz; + unsigned int page; + + assert(sig); + assert(pagesize); + + *pagesize = 0; + *sig = 0; + + buf = xmalloc(MAX_PAGESIZE); + + datasz = read(fd, buf, MAX_PAGESIZE); + if (datasz == (ssize_t) -1) + goto err; + + for (page = 0x1000; page <= MAX_PAGESIZE; page <<= 1) { + /* skip 32k pagesize since this does not seem to + * be supported */ + if (page == 0x8000) + continue; + /* the smallest swap area is PAGE_SIZE*10, it means + * 40k, that's less than MAX_PAGESIZE */ + if (datasz < 0 || (size_t) datasz < (page - SWAP_SIGNATURE_SZ)) + break; + if (swap_detect_signature(buf + page - SWAP_SIGNATURE_SZ, sig)) { + *pagesize = page; + break; + } + } + + if (*pagesize) + return buf; +err: + free(buf); + return NULL; +} + +/* returns real size of swap space */ +static unsigned long long swap_get_size(const struct swap_device *dev, + const char *hdr) +{ + unsigned int last_page = 0; + const unsigned int swap_version = SWAP_VERSION; + const struct swap_header_v1_2 *s; + + assert(dev); + assert(dev->pagesize > 0); + + s = (const struct swap_header_v1_2 *) hdr; + + if (s->version == swap_version) + last_page = s->last_page; + else if (swab32(s->version) == swap_version) + last_page = swab32(s->last_page); + + return ((unsigned long long) last_page + 1) * dev->pagesize; +} + +static void swap_get_info(struct swap_device *dev, const char *hdr) +{ + const struct swap_header_v1_2 *s = (const struct swap_header_v1_2 *) hdr; + + assert(dev); + + if (s && *s->volume_name) + dev->label = xstrdup(s->volume_name); + + if (s && *s->uuid) { + const unsigned char *u = s->uuid; + char str[UUID_STR_LEN]; + + snprintf(str, sizeof(str), + "%02x%02x%02x%02x-" + "%02x%02x-%02x%02x-" + "%02x%02x-%02x%02x%02x%02x%02x%02x", + u[0], u[1], u[2], u[3], + u[4], u[5], u[6], u[7], + u[8], u[9], u[10], u[11], u[12], u[13], u[14], u[15]); + dev->uuid = xstrdup(str); + } +} + +static int swapon_checks(const struct swapon_ctl *ctl, struct swap_device *dev) +{ + struct stat st; + int fd, sig; + char *hdr = NULL; + unsigned long long devsize = 0; + int permMask; + + assert(ctl); + assert(dev); + assert(dev->path); + + fd = open(dev->path, O_RDONLY); + if (fd == -1) { + warn(_("cannot open %s"), dev->path); + goto err; + } + + if (fstat(fd, &st) < 0) { + warn(_("stat of %s failed"), dev->path); + goto err; + } + + permMask = S_ISBLK(st.st_mode) ? 07007 : 07077; + if ((st.st_mode & permMask) != 0) + warnx(_("%s: insecure permissions %04o, %04o suggested."), + dev->path, st.st_mode & 07777, + ~permMask & 0666); + + if (S_ISREG(st.st_mode) && st.st_uid != 0) + warnx(_("%s: insecure file owner %d, 0 (root) suggested."), + dev->path, st.st_uid); + + /* test for holes by LBT */ + if (S_ISREG(st.st_mode)) { + if (st.st_blocks * 512 < st.st_size) { + warnx(_("%s: skipping - it appears to have holes."), + dev->path); + goto err; + } + devsize = st.st_size; + } + + if (S_ISBLK(st.st_mode) && blkdev_get_size(fd, &devsize)) { + warnx(_("%s: get size failed"), dev->path); + goto err; + } + + hdr = swap_get_header(fd, &sig, &dev->pagesize); + if (!hdr) { + warnx(_("%s: read swap header failed"), dev->path); + goto err; + } + + if (ctl->verbose) + warnx(_("%s: found signature [pagesize=%d, signature=%s]"), + dev->path, + dev->pagesize, + sig == SIG_SWAPSPACE ? "swap" : + sig == SIG_SWSUSPEND ? "suspend" : "unknown"); + + if (sig == SIG_SWAPSPACE && dev->pagesize) { + unsigned long long swapsize = swap_get_size(dev, hdr); + int syspg = getpagesize(); + + if (ctl->verbose) + warnx(_("%s: pagesize=%d, swapsize=%llu, devsize=%llu"), + dev->path, dev->pagesize, swapsize, devsize); + + if (swapsize > devsize) { + if (ctl->verbose) + warnx(_("%s: last_page 0x%08llx is larger" + " than actual size of swapspace"), + dev->path, swapsize); + + } else if (syspg < 0 || (unsigned int) syspg != dev->pagesize) { + if (ctl->fix_page_size) { + int rc; + + swap_get_info(dev, hdr); + + warnx(_("%s: swap format pagesize does not match."), + dev->path); + rc = swap_reinitialize(dev); + if (rc < 0) + goto err; + } else + warnx(_("%s: swap format pagesize does not match. " + "(Use --fixpgsz to reinitialize it.)"), + dev->path); + } + } else if (sig == SIG_SWSUSPEND) { + /* We have to reinitialize swap with old (=useless) software suspend + * data. The problem is that if we don't do it, then we get data + * corruption the next time an attempt at unsuspending is made. + */ + warnx(_("%s: software suspend data detected. " + "Rewriting the swap signature."), + dev->path); + if (swap_rewrite_signature(dev) < 0) + goto err; + } + + free(hdr); + close(fd); + return 0; +err: + if (fd != -1) + close(fd); + free(hdr); + return -1; +} + +static int do_swapon(const struct swapon_ctl *ctl, + const struct swap_prop *prop, + const char *spec, + int canonic) +{ + struct swap_device dev = { .path = NULL }; + int status; + int flags = 0; + int priority; + + assert(ctl); + assert(prop); + + if (!canonic) { + dev.path = mnt_resolve_spec(spec, mntcache); + if (!dev.path) + return cannot_find(spec); + } else + dev.path = spec; + + priority = prop->priority; + + if (swapon_checks(ctl, &dev)) + return -1; + +#ifdef SWAP_FLAG_PREFER + if (priority >= 0) { + if (priority > SWAP_FLAG_PRIO_MASK) + priority = SWAP_FLAG_PRIO_MASK; + + flags = SWAP_FLAG_PREFER + | ((priority & SWAP_FLAG_PRIO_MASK) + << SWAP_FLAG_PRIO_SHIFT); + } +#endif + /* + * Validate the discard flags passed and set them + * accordingly before calling sys_swapon. + */ + if (prop->discard && !(prop->discard & ~SWAP_FLAGS_DISCARD_VALID)) { + /* + * If we get here with both discard policy flags set, + * we just need to tell the kernel to enable discards + * and it will do correctly, just as we expect. + */ + if ((prop->discard & SWAP_FLAG_DISCARD_ONCE) && + (prop->discard & SWAP_FLAG_DISCARD_PAGES)) + flags |= SWAP_FLAG_DISCARD; + else + flags |= prop->discard; + } + + if (ctl->verbose) + printf(_("swapon %s\n"), dev.path); + + status = swapon(dev.path, flags); + if (status < 0) + warn(_("%s: swapon failed"), dev.path); + + return status; +} + +static int swapon_by_label(struct swapon_ctl *ctl, const char *label) +{ + char *device = mnt_resolve_tag("LABEL", label, mntcache); + return device ? do_swapon(ctl, &ctl->props, device, TRUE) : cannot_find(label); +} + +static int swapon_by_uuid(struct swapon_ctl *ctl, const char *uuid) +{ + char *device = mnt_resolve_tag("UUID", uuid, mntcache); + return device ? do_swapon(ctl, &ctl->props, device, TRUE) : cannot_find(uuid); +} + +/* -o <options> or fstab */ +static int parse_options(struct swap_prop *props, const char *options) +{ + char *arg = NULL; + size_t argsz = 0; + + assert(props); + assert(options); + + if (mnt_optstr_get_option(options, "nofail", NULL, NULL) == 0) + props->no_fail = 1; + + if (mnt_optstr_get_option(options, "discard", &arg, &argsz) == 0) { + props->discard |= SWAP_FLAG_DISCARD; + + if (arg) { + /* only single-time discards are wanted */ + if (strncmp(arg, "once", argsz) == 0) + props->discard |= SWAP_FLAG_DISCARD_ONCE; + + /* do discard for every released swap page */ + if (strncmp(arg, "pages", argsz) == 0) + props->discard |= SWAP_FLAG_DISCARD_PAGES; + } + } + + arg = NULL; + if (mnt_optstr_get_option(options, "pri", &arg, NULL) == 0 && arg) + props->priority = atoi(arg); + + return 0; +} + + +static int swapon_all(struct swapon_ctl *ctl) +{ + struct libmnt_table *tb = get_fstab(); + struct libmnt_iter *itr; + struct libmnt_fs *fs; + int status = 0; + + if (!tb) + err(EXIT_FAILURE, _("failed to parse %s"), mnt_get_fstab_path()); + + itr = mnt_new_iter(MNT_ITER_FORWARD); + if (!itr) + err(EXIT_FAILURE, _("failed to initialize libmount iterator")); + + while (mnt_table_find_next_fs(tb, itr, match_swap, NULL, &fs) == 0) { + /* defaults */ + const char *opts; + const char *device; + struct swap_prop prop; /* per device setting */ + + if (mnt_fs_get_option(fs, "noauto", NULL, NULL) == 0) { + if (ctl->verbose) + warnx(_("%s: noauto option -- ignored"), mnt_fs_get_source(fs)); + continue; + } + + /* default setting */ + prop = ctl->props; + + /* overwrite default by setting from fstab */ + opts = mnt_fs_get_options(fs); + if (opts) + parse_options(&prop, opts); + + /* convert LABEL=, UUID= etc. from fstab to device name */ + device = mnt_resolve_spec(mnt_fs_get_source(fs), mntcache); + if (!device) { + if (!prop.no_fail) + status |= cannot_find(mnt_fs_get_source(fs)); + continue; + } + + if (is_active_swap(device)) { + if (ctl->verbose) + warnx(_("%s: already active -- ignored"), device); + continue; + } + + if (prop.no_fail && access(device, R_OK) != 0) { + if (ctl->verbose) + warnx(_("%s: inaccessible -- ignored"), device); + continue; + } + + /* swapon */ + status |= do_swapon(ctl, &prop, device, TRUE); + } + + mnt_free_iter(itr); + return status; +} + + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] [<spec>]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Enable devices and files for paging and swapping.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --all enable all swaps from /etc/fstab\n"), out); + fputs(_(" -d, --discard[=<policy>] enable swap discards, if supported by device\n"), out); + fputs(_(" -e, --ifexists silently skip devices that do not exist\n"), out); + fputs(_(" -f, --fixpgsz reinitialize the swap space if necessary\n"), out); + fputs(_(" -o, --options <list> comma-separated list of swap options\n"), out); + fputs(_(" -p, --priority <prio> specify the priority of the swap device\n"), out); + fputs(_(" -s, --summary display summary about used swap devices (DEPRECATED)\n"), out); + fputs(_(" --show[=<columns>] display summary in definable table\n"), out); + fputs(_(" --noheadings don't print table heading (with --show)\n"), out); + fputs(_(" --raw use the raw output format (with --show)\n"), out); + fputs(_(" --bytes display swap size in bytes in --show output\n"), out); + fputs(_(" -v, --verbose verbose mode\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(26)); + + fputs(_("\nThe <spec> parameter:\n" \ + " -L <label> synonym for LABEL=<label>\n" + " -U <uuid> synonym for UUID=<uuid>\n" + " LABEL=<label> specifies device by swap area label\n" + " UUID=<uuid> specifies device by swap area UUID\n" + " PARTLABEL=<label> specifies device by partition label\n" + " PARTUUID=<uuid> specifies device by partition UUID\n" + " <device> name of device to be used\n" + " <file> name of file to be used\n"), out); + + fputs(_("\nAvailable discard policy types (for --discard):\n" + " once : only single-time area discards are issued\n" + " pages : freed pages are discarded before they are reused\n" + "If no policy is selected, both discard types are enabled (default).\n"), out); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(infos); i++) + fprintf(out, " %-5s %s\n", infos[i].name, _(infos[i].help)); + + printf(USAGE_MAN_TAIL("swapon(8)")); + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + int status = 0, c; + size_t i; + char *options = NULL; + + enum { + BYTES_OPTION = CHAR_MAX + 1, + NOHEADINGS_OPTION, + RAW_OPTION, + SHOW_OPTION, + OPT_LIST_TYPES + }; + + static const struct option long_opts[] = { + { "priority", required_argument, NULL, 'p' }, + { "discard", optional_argument, NULL, 'd' }, + { "ifexists", no_argument, NULL, 'e' }, + { "options", optional_argument, NULL, 'o' }, + { "summary", no_argument, NULL, 's' }, + { "fixpgsz", no_argument, NULL, 'f' }, + { "all", no_argument, NULL, 'a' }, + { "help", no_argument, NULL, 'h' }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + { "show", optional_argument, NULL, SHOW_OPTION }, + { "output-all", no_argument, NULL, OPT_LIST_TYPES }, + { "noheadings", no_argument, NULL, NOHEADINGS_OPTION }, + { "raw", no_argument, NULL, RAW_OPTION }, + { "bytes", no_argument, NULL, BYTES_OPTION }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'a','o','s', SHOW_OPTION }, + { 'a','o', BYTES_OPTION }, + { 'a','o', NOHEADINGS_OPTION }, + { 'a','o', RAW_OPTION }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + struct swapon_ctl ctl; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + memset(&ctl, 0, sizeof(struct swapon_ctl)); + ctl.props.priority = -1; + + mnt_init_debug(0); + mntcache = mnt_new_cache(); + + while ((c = getopt_long(argc, argv, "ahd::efo:p:svVL:U:", + long_opts, NULL)) != -1) { + + err_exclusive_options(c, long_opts, excl, excl_st); + + switch (c) { + case 'a': /* all */ + ctl.all = 1; + break; + case 'h': /* help */ + usage(); + break; + case 'o': + options = optarg; + break; + case 'p': /* priority */ + ctl.props.priority = strtos16_or_err(optarg, + _("failed to parse priority")); + break; + case 'L': + add_label(optarg); + break; + case 'U': + add_uuid(optarg); + break; + case 'd': + ctl.props.discard |= SWAP_FLAG_DISCARD; + if (optarg) { + if (*optarg == '=') + optarg++; + + if (strcmp(optarg, "once") == 0) + ctl.props.discard |= SWAP_FLAG_DISCARD_ONCE; + else if (strcmp(optarg, "pages") == 0) + ctl.props.discard |= SWAP_FLAG_DISCARD_PAGES; + else + errx(EXIT_FAILURE, _("unsupported discard policy: %s"), optarg); + } + break; + case 'e': /* ifexists */ + ctl.props.no_fail = 1; + break; + case 'f': + ctl.fix_page_size = 1; + break; + case 's': /* status report */ + status = display_summary(); + return status; + case 'v': /* be chatty */ + ctl.verbose = 1; + break; + case SHOW_OPTION: + if (optarg) { + ctl.ncolumns = string_to_idarray(optarg, + ctl.columns, + ARRAY_SIZE(ctl.columns), + column_name_to_id); + if (ctl.ncolumns < 0) + return EXIT_FAILURE; + } + ctl.show = 1; + break; + case OPT_LIST_TYPES: + for (ctl.ncolumns = 0; (size_t)ctl.ncolumns < ARRAY_SIZE(infos); ctl.ncolumns++) + ctl.columns[ctl.ncolumns] = ctl.ncolumns; + break; + case NOHEADINGS_OPTION: + ctl.no_heading = 1; + break; + case RAW_OPTION: + ctl.raw = 1; + break; + case BYTES_OPTION: + ctl.bytes = 1; + break; + case 'V': /* version */ + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 0: + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + argv += optind; + + if (ctl.show || (!ctl.all && !numof_labels() && !numof_uuids() && *argv == NULL)) { + if (!ctl.ncolumns) { + /* default columns */ + ctl.columns[ctl.ncolumns++] = COL_PATH; + ctl.columns[ctl.ncolumns++] = COL_TYPE; + ctl.columns[ctl.ncolumns++] = COL_SIZE; + ctl.columns[ctl.ncolumns++] = COL_USED; + ctl.columns[ctl.ncolumns++] = COL_PRIO; + } + status = show_table(&ctl); + return status; + } + + if (ctl.props.no_fail && !ctl.all) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + if (ctl.all) + status |= swapon_all(&ctl); + + if (options) + parse_options(&ctl.props, options); + + for (i = 0; i < numof_labels(); i++) + status |= swapon_by_label(&ctl, get_label(i)); + + for (i = 0; i < numof_uuids(); i++) + status |= swapon_by_uuid(&ctl, get_uuid(i)); + + while (*argv != NULL) + status |= do_swapon(&ctl, &ctl.props, *argv++, FALSE); + + free_tables(); + mnt_unref_cache(mntcache); + + return status; +} diff --git a/sys-utils/switch_root.8 b/sys-utils/switch_root.8 new file mode 100644 index 0000000..4e162b3 --- /dev/null +++ b/sys-utils/switch_root.8 @@ -0,0 +1,61 @@ +.\" Karel Zak <kzak@redhat.com> +.TH SWITCH_ROOT 8 "June 2009" "util-linux" "System Administration" +.SH NAME +switch_root \- switch to another filesystem as the root of the mount tree +.SH SYNOPSIS +.B switch_root +.RB [ \-hV ] +.LP +.B switch_root +.I newroot +.I init +.RI [ arg ...] +.SH DESCRIPTION +.B switch_root +moves already mounted /proc, /dev, /sys and /run to +.I newroot +and makes +.I newroot +the new root filesystem and starts +.I init +process. + +.B WARNING: switch_root removes recursively all files and directories on the current root filesystem. + +.SH OPTIONS +.IP "\fB\-h, \-\-help\fP" +Display help text and exit. +.IP "\fB\-V, \-\-version\fP" +Display version information and exit. + +.SH RETURN VALUE +.B switch_root +returns 0 on success and 1 on failure. + +.SH NOTES +switch_root will fail to function if +.B newroot +is not the root of a mount. If you want to switch root into a directory that +does not meet this requirement then you can first use a bind-mounting trick to +turn any directory into a mount point: +.sp +.nf +.RS +mount --bind $DIR $DIR +.RE +.fi + +.SH "SEE ALSO" +.BR chroot (2), +.BR init (8), +.BR mkinitrd (8), +.BR mount (8) +.SH AUTHORS +.nf +Peter Jones <pjones@redhat.com> +Jeremy Katz <katzj@redhat.com> +Karel Zak <kzak@redhat.com> +.fi +.SH AVAILABILITY +The switch_root command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/switch_root.c b/sys-utils/switch_root.c new file mode 100644 index 0000000..a85ce24 --- /dev/null +++ b/sys-utils/switch_root.c @@ -0,0 +1,263 @@ +/* + * switchroot.c - switch to new root directory and start init. + * + * Copyright 2002-2009 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Authors: + * Peter Jones <pjones@redhat.com> + * Jeremy Katz <katzj@redhat.com> + */ +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/statfs.h> +#include <sys/param.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <ctype.h> +#include <dirent.h> +#include <getopt.h> + +#include "c.h" +#include "nls.h" +#include "closestream.h" +#include "statfs_magic.h" + +#ifndef MS_MOVE +#define MS_MOVE 8192 +#endif + +#ifndef MNT_DETACH +#define MNT_DETACH 0x00000002 /* Just detach from the tree */ +#endif + +/* remove all files/directories below dirName -- don't cross mountpoints */ +static int recursiveRemove(int fd) +{ + struct stat rb; + DIR *dir; + int rc = -1; + int dfd; + + if (!(dir = fdopendir(fd))) { + warn(_("failed to open directory")); + goto done; + } + + /* fdopendir() precludes us from continuing to use the input fd */ + dfd = dirfd(dir); + + if (fstat(dfd, &rb)) { + warn(_("stat failed")); + goto done; + } + + while(1) { + struct dirent *d; + int isdir = 0; + + errno = 0; + if (!(d = readdir(dir))) { + if (errno) { + warn(_("failed to read directory")); + goto done; + } + break; /* end of directory */ + } + + if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, "..")) + continue; +#ifdef _DIRENT_HAVE_D_TYPE + if (d->d_type == DT_DIR || d->d_type == DT_UNKNOWN) +#endif + { + struct stat sb; + + if (fstatat(dfd, d->d_name, &sb, AT_SYMLINK_NOFOLLOW)) { + warn(_("stat of %s failed"), d->d_name); + continue; + } + + /* skip if device is not the same */ + if (sb.st_dev != rb.st_dev) + continue; + + /* remove subdirectories */ + if (S_ISDIR(sb.st_mode)) { + int cfd; + + cfd = openat(dfd, d->d_name, O_RDONLY); + if (cfd >= 0) { + recursiveRemove(cfd); + close(cfd); + } + isdir = 1; + } + } + + if (unlinkat(dfd, d->d_name, isdir ? AT_REMOVEDIR : 0)) + warn(_("failed to unlink %s"), d->d_name); + } + + rc = 0; /* success */ + +done: + if (dir) + closedir(dir); + return rc; +} + +static int switchroot(const char *newroot) +{ + /* Don't try to unmount the old "/", there's no way to do it. */ + const char *umounts[] = { "/dev", "/proc", "/sys", "/run", NULL }; + int i; + int cfd; + pid_t pid; + struct stat newroot_stat, sb; + + if (stat(newroot, &newroot_stat) != 0) { + warn(_("stat of %s failed"), newroot); + return -1; + } + + for (i = 0; umounts[i] != NULL; i++) { + char newmount[PATH_MAX]; + + snprintf(newmount, sizeof(newmount), "%s%s", newroot, umounts[i]); + + if ((stat(newmount, &sb) != 0) || (sb.st_dev != newroot_stat.st_dev)) { + /* mount point seems to be mounted already or stat failed */ + umount2(umounts[i], MNT_DETACH); + continue; + } + + if (mount(umounts[i], newmount, NULL, MS_MOVE, NULL) < 0) { + warn(_("failed to mount moving %s to %s"), + umounts[i], newmount); + warnx(_("forcing unmount of %s"), umounts[i]); + umount2(umounts[i], MNT_FORCE); + } + } + + if (chdir(newroot)) { + warn(_("failed to change directory to %s"), newroot); + return -1; + } + + cfd = open("/", O_RDONLY); + if (cfd < 0) { + warn(_("cannot open %s"), "/"); + return -1; + } + + if (mount(newroot, "/", NULL, MS_MOVE, NULL) < 0) { + close(cfd); + warn(_("failed to mount moving %s to /"), newroot); + return -1; + } + + if (chroot(".")) { + close(cfd); + warn(_("failed to change root")); + return -1; + } + + pid = fork(); + if (pid <= 0) { + struct statfs stfs; + + if (fstatfs(cfd, &stfs) == 0 && + (F_TYPE_EQUAL(stfs.f_type, STATFS_RAMFS_MAGIC) || + F_TYPE_EQUAL(stfs.f_type, STATFS_TMPFS_MAGIC))) + recursiveRemove(cfd); + else + warn(_("old root filesystem is not an initramfs")); + if (pid == 0) + exit(EXIT_SUCCESS); + } + + close(cfd); + return 0; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *output = stdout; + fputs(USAGE_HEADER, output); + fprintf(output, _(" %s [options] <newrootdir> <init> <args to init>\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, output); + fputs(_("Switch to another filesystem as the root of the mount tree.\n"), output); + + fputs(USAGE_OPTIONS, output); + printf(USAGE_HELP_OPTIONS(16)); + printf(USAGE_MAN_TAIL("switch_root(8)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + char *newroot, *init, **initargs; + int c; + static const struct option longopts[] = { + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "+Vh", longopts, NULL)) != -1) + switch (c) { + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + if (argc < 3) { + warnx(_("not enough arguments")); + errtryhelp(EXIT_FAILURE); + } + + newroot = argv[1]; + init = argv[2]; + initargs = &argv[2]; + + if (!*newroot || !*init) { + warnx(_("bad usage")); + errtryhelp(EXIT_FAILURE); + } + + if (switchroot(newroot)) + errx(EXIT_FAILURE, _("failed. Sorry.")); + + if (access(init, X_OK)) + warn(_("cannot access %s"), init); + + execv(init, initargs); + errexec(init); +} + diff --git a/sys-utils/tunelp.8 b/sys-utils/tunelp.8 new file mode 100644 index 0000000..90db834 --- /dev/null +++ b/sys-utils/tunelp.8 @@ -0,0 +1,122 @@ +.\" Copyright (C) 1992-1997 Michael K. Johnson <johnsonm@redhat.com> +.\" Copyright (C) 1998 Andrea Arcangeli <andrea@e-mind.com> +.\" It may be distributed under the terms of the GNU General Public License, +.\" version 2, or any higher version. See section COPYING of the GNU General +.\" Public license for conditions under which this file may be redistributed. +.\" +.TH TUNELP 8 "October 2011" "util-linux" "System Administration" +.SH NAME +tunelp \- set various parameters for the lp device +.SH SYNOPSIS +.B tunelp +[options] +.I device +.SH DESCRIPTION +\fBtunelp\fP sets several parameters for the /dev/lp\fI?\fP devices, for +better performance (or for any performance at all, if your printer won't work +without it...) Without parameters, it tells whether the device is using +interrupts, and if so, which one. With parameters, it sets the device +characteristics accordingly. +.SH OPTIONS +.TP +\fB\-i\fR, \fB\-\-irq\fR \fIargument\fR +specifies the IRQ to use for the parallel port in question. If this is set +to something non-zero, \-t and \-c have no effect. If your port does not use +interrupts, this option will make printing stop. The command +.B tunelp -i 0 +restores non-interrupt driven (polling) action, and your printer should work +again. If your parallel port does support interrupts, interrupt-driven +printing should be somewhat faster and efficient, and will probably be +desirable. +.IP +NOTE: This option will have no effect with kernel 2.1.131 or later since the +irq is handled by the parport driver. You can change the parport irq for +example via +.IR /proc/parport/*/irq . +Read +.I /usr/src/linux/Documentation/parport.txt +for more details on parport. +.TP +\fB\-t\fR, \fB\-\-time\fR \fImilliseconds\fR +is the amount of time in jiffies that the driver waits if the printer doesn't +take a character for the number of tries dictated by the \-c parameter. 10 +is the default value. If you want fastest possible printing, and don't care +about system load, you may set this to 0. If you don't care how fast your +printer goes, or are printing text on a slow printer with a buffer, then 500 +(5 seconds) should be fine, and will give you very low system load. This +value generally should be lower for printing graphics than text, by a factor +of approximately 10, for best performance. +.TP +\fB\-c\fR, \fB\-\-chars\fR \fIcharacters\fR +is the number of times to try to output a character to the printer before +sleeping for \-t \fITIME\fP. It is the number of times around a loop that +tries to send a character to the printer. 120 appears to be a good value for +most printers in polling mode. 1000 is the default, because there are some +printers that become jerky otherwise, but you \fImust\fP set this to `1' to +handle the maximal CPU efficiency if you are using interrupts. If you have a +very fast printer, a value of 10 might make more sense even if in polling +mode. If you have a \fIreally\fP old printer, you can increase this further. +.IP +Setting \-t \fITIME\fP to 0 is equivalent to setting \-c \fICHARS\fP to +infinity. +.TP +\fB\-w\fR, \fB\-\-wait\fR \fImilliseconds\fR +is the number of usec we wait while playing with the strobe signal. While +most printers appear to be able to deal with an extremely short strobe, some +printers demand a longer one. Increasing this from the default 1 may make it +possible to print with those printers. This may also make it possible to use +longer cables. It's also possible to decrease this value to 0 if your +printer is fast enough or your machine is slow enough. +.TP +\fB\-a\fR, \fB\-\-abort\fR \fI<on|off>\fR +This is whether to abort on printer error - the default is not to. If you +are sitting at your computer, you probably want to be able to see an error +and fix it, and have the printer go on printing. On the other hand, if you +aren't, you might rather that your printer spooler find out that the printer +isn't ready, quit trying, and send you mail about it. The choice is yours. +.TP +\fB\-o\fR, \fB\-\-check\-status\fR \fI<on|off>\fR +This option is much like \-a. It makes any +.BR open (2) +of this device check to see that the device is on-line and not reporting any +out of paper or other errors. This is the correct setting for most versions +of lpd. +.TP +\fB\-C\fR, \fB\-\-careful\fR \fI<on|off>\fR +This option adds extra ("careful") error checking. When this option is on, +the printer driver will ensure that the printer is on-line and not reporting +any out of paper or other errors before sending data. This is particularly +useful for printers that normally appear to accept data when turned off. +.IP +NOTE: This option is obsolete because it's the default in 2.1.131 kernel or +later. +.TP +\fB\-s\fR, \fB\-\-status\fR +This option returns the current printer status, both as a decimal number from +0..255, and as a list of active flags. When this option is specified, \-q +off, turning off the display of the current IRQ, is implied. +.TP +\fB\-r\fR, \fB\-\-reset\fR +This option resets the port. It requires a Linux kernel version of 1.1.80 or +later. +.TP +\fB\-q\fR, \fB\-\-print\-irq\fR \fI<on|off>\fR +This option sets printing the display of the current IRQ setting. +.SH NOTES +.BR \-o , +.BR \-C , +and +.B \-s +all require a Linux kernel version of 1.1.76 or later. +.PP +.B \-C +requires a Linux version prior to 2.1.131. +.SH FILES +.I /dev/lp? +.br +.I /proc/parport/*/* +.SH AVAILABILITY +The tunelp command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/tunelp.c b/sys-utils/tunelp.c new file mode 100644 index 0000000..fe261f3 --- /dev/null +++ b/sys-utils/tunelp.c @@ -0,0 +1,321 @@ +/* + * Copyright (C) 1992-1997 Michael K. Johnson, johnsonm@redhat.com + * + * This file is licensed under the terms of the GNU General Public + * License, version 2, or any later version. See file COPYING for + * information on distribution conditions. + */ + +/* + * This command is deprecated. The utility is in maintenance mode, + * meaning we keep them in source tree for backward compatibility + * only. Do not waste time making this command better, unless the + * fix is about security or other very critical issue. + * + * See Documentation/deprecated.txt for more information. + */ + +/* + * $Log: tunelp.c,v $ + * Revision 1.9 1998/06/08 19:37:11 janl + * Thus compiles tunelp with 2.1.103 kernels + * + * Revision 1.8 1997/07/06 00:14:06 aebr + * Fixes to silence -Wall. + * + * Revision 1.7 1997/06/20 16:10:38 janl + * tunelp refreshed from authors archive. + * + * Revision 1.9 1997/06/20 12:56:43 johnsonm + * Finished fixing license terms. + * + * Revision 1.8 1997/06/20 12:34:59 johnsonm + * Fixed copyright and license. + * + * Revision 1.7 1995/03/29 11:16:23 johnsonm + * TYPO fixed... + * + * Revision 1.6 1995/03/29 11:12:15 johnsonm + * Added third argument to ioctl needed with new kernels + * + * Revision 1.5 1995/01/13 10:33:43 johnsonm + * Chris's changes for new ioctl numbers and backwards compatibility + * and the reset ioctl. + * + * Revision 1.4 1995/01/03 17:42:14 johnsonm + * -s isn't supposed to take an argument; removed : after s in getopt... + * + * Revision 1.3 1995/01/03 07:36:49 johnsonm + * Fixed typo + * + * Revision 1.2 1995/01/03 07:33:44 johnsonm + * revisions for lp driver updates in Linux 1.1.76 + * + * 1999-02-22 Arkadiusz Miśkiewicz <misiek@pld.ORG.PL> + * - added Native Language Support + * + * 1999-05-07 Merged LPTRUSTIRQ patch by Andrea Arcangeli (1998/11/29), aeb + * + */ + +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <linux/lp.h> + +#include "nls.h" +#include "closestream.h" +#include "strutils.h" + +#define EXIT_LP_MALLOC 2 +#define EXIT_LP_BADVAL 3 +#define EXIT_LP_IO_ERR 4 + +#define XALLOC_EXIT_CODE EXIT_LP_MALLOC +#include "xalloc.h" + +struct command { + long op; + long val; + struct command *next; +}; + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] <device>\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Set various parameters for the line printer.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -i, --irq <num> specify parallel port irq\n"), out); + fputs(_(" -t, --time <ms> driver wait time in milliseconds\n"), out); + fputs(_(" -c, --chars <num> number of output characters before sleep\n"), out); + fputs(_(" -w, --wait <us> strobe wait in micro seconds\n"), out); + /* TRANSLATORS: do not translate <on|off> arguments. The + argument reader does not recognize locale, unless `on' is + exactly that very same string. */ + fputs(_(" -a, --abort <on|off> abort on error\n"), out); + fputs(_(" -o, --check-status <on|off> check printer status before printing\n"), out); + fputs(_(" -C, --careful <on|off> extra checking to status check\n"), out); + fputs(_(" -s, --status query printer status\n"), out); + fputs(_(" -r, --reset reset the port\n"), out); + fputs(_(" -q, --print-irq <on|off> display current irq setting\n"), out); + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(30)); + printf(USAGE_MAN_TAIL("tunelp(8)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + int c, fd, irq, status, show_irq, offset = 0, retval; + char *filename; + struct stat statbuf; + struct command *cmds, *cmdst; + static const struct option longopts[] = { + {"irq", required_argument, NULL, 'i'}, + {"time", required_argument, NULL, 't'}, + {"chars", required_argument, NULL, 'c'}, + {"wait", required_argument, NULL, 'w'}, + {"abort", required_argument, NULL, 'a'}, + {"check-status", required_argument, NULL, 'o'}, + {"careful", required_argument, NULL, 'C'}, + {"status", no_argument, NULL, 's'}, + {"trust-irq", required_argument, NULL, 'T'}, + {"reset", no_argument, NULL, 'r'}, + {"print-irq", required_argument, NULL, 'q'}, + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + strutils_set_exitcode(EXIT_LP_BADVAL); + + if (argc < 2) { + warnx(_("not enough arguments")); + errtryhelp(EXIT_FAILURE); + } + + cmdst = cmds = xmalloc(sizeof(struct command)); + cmds->next = NULL; + + show_irq = 1; + while ((c = getopt_long(argc, argv, "t:c:w:a:i:ho:C:sq:rT:vV", longopts, NULL)) != -1) { + switch (c) { + case 'h': + usage(); + break; + case 'i': + cmds->op = LPSETIRQ; + cmds->val = strtol_or_err(optarg, _("argument error")); + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 't': + cmds->op = LPTIME; + cmds->val = strtol_or_err(optarg, _("argument error")); + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 'c': + cmds->op = LPCHAR; + cmds->val = strtol_or_err(optarg, _("argument error")); + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 'w': + cmds->op = LPWAIT; + cmds->val = strtol_or_err(optarg, _("argument error")); + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 'a': + cmds->op = LPABORT; + cmds->val = parse_switch(optarg, _("argument error"), "on", "off", NULL); + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 'q': + show_irq = parse_switch(optarg, _("argument error"), "on", "off", NULL); + break; + case 'o': + cmds->op = LPABORTOPEN; + cmds->val = parse_switch(optarg, _("argument error"), "on", "off", NULL); + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 'C': + cmds->op = LPCAREFUL; + cmds->val = parse_switch(optarg, _("argument error"), "on", "off", NULL); + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 's': + show_irq = 0; + cmds->op = LPGETSTATUS; + cmds->val = 0; + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 'r': + cmds->op = LPRESET; + cmds->val = 0; + cmds->next = xmalloc(sizeof(struct command)); + cmds = cmds->next; + cmds->next = NULL; + break; + case 'v': + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (optind != argc - 1) { + warnx(_("no device specified")); + errtryhelp(EXIT_FAILURE); + } + + filename = xstrdup(argv[optind]); + fd = open(filename, O_WRONLY | O_NONBLOCK, 0); + /* Need to open O_NONBLOCK in case ABORTOPEN is already set + * and printer is off or off-line or in an error condition. + * Otherwise we would abort... + */ + if (fd < 0) + err(EXIT_FAILURE, "%s", filename); + + if (fstat(fd, &statbuf)) + err(EXIT_FAILURE, "%s: stat() failed", filename); + + if (!S_ISCHR(statbuf.st_mode)) { + warnx(_("%s not an lp device"), filename); + errtryhelp(EXIT_FAILURE); + } + /* Allow for binaries compiled under a new kernel to work on + * the old ones The irq argument to ioctl isn't touched by + * the old kernels, but we don't want to cause the kernel to + * complain if we are using a new kernel + */ + if (LPGETIRQ >= 0x0600 && ioctl(fd, LPGETIRQ, &irq) < 0 + && errno == EINVAL) + /* We don't understand the new ioctls */ + offset = 0x0600; + + cmds = cmdst; + while (cmds->next) { + if (cmds->op == LPGETSTATUS) { + status = 0xdeadbeef; + retval = ioctl(fd, LPGETSTATUS - offset, &status); + if (retval < 0) + warnx(_("LPGETSTATUS error")); + else { + if (status == (int)0xdeadbeef) + /* a few 1.1.7x kernels will do this */ + status = retval; + printf(_("%s status is %d"), filename, status); + if (!(status & LP_PBUSY)) + printf(_(", busy")); + if (!(status & LP_PACK)) + printf(_(", ready")); + if ((status & LP_POUTPA)) + printf(_(", out of paper")); + if ((status & LP_PSELECD)) + printf(_(", on-line")); + if (!(status & LP_PERRORP)) + printf(_(", error")); + printf("\n"); + } + } else + if (ioctl(fd, cmds->op - offset, cmds->val) < 0) + warn(_("ioctl failed")); + cmdst = cmds; + cmds = cmds->next; + free(cmdst); + } + + if (show_irq) { + irq = 0xdeadbeef; + retval = ioctl(fd, LPGETIRQ - offset, &irq); + if (retval == -1) + err(EXIT_LP_IO_ERR, _("LPGETIRQ error")); + if (irq == (int)0xdeadbeef) + /* up to 1.1.77 will do this */ + irq = retval; + if (irq) + printf(_("%s using IRQ %d\n"), filename, irq); + else + printf(_("%s using polling\n"), filename); + } + free(filename); + close(fd); + + return EXIT_SUCCESS; +} diff --git a/sys-utils/umount.8 b/sys-utils/umount.8 new file mode 100644 index 0000000..f94d2f4 --- /dev/null +++ b/sys-utils/umount.8 @@ -0,0 +1,267 @@ +.\" Copyright (c) 1996 Andries Brouwer +.\" This page is somewhat derived from a page that was +.\" (c) 1980, 1989, 1991 The Regents of the University of California +.\" and had been heavily modified by Rik Faith and myself. +.\" +.\" This is free documentation; you can redistribute it and/or +.\" modify it under the terms of the GNU General Public License as +.\" published by the Free Software Foundation; either version 2 of +.\" the License, or (at your option) any later version. +.\" +.\" The GNU General Public License's references to "object code" +.\" and "executables" are to be interpreted as the output of any +.\" document formatting or typesetting system, including +.\" intermediate and printed output. +.\" +.\" This manual is distributed in the hope that it will be useful, +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +.\" GNU General Public License for more details. +.\" +.\" You should have received a copy of the GNU General Public License along +.\" with this program; if not, write to the Free Software Foundation, Inc., +.\" 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +.\" +.TH UMOUNT 8 "July 2014" "util-linux" "System Administration" +.SH NAME +umount \- unmount file systems +.SH SYNOPSIS +.B umount \-a +.RB [ \-dflnrv ] +.RB [ \-t +.IR fstype ] +.RB [ \-O +.IR option ...] +.sp +.B umount +.RB [ \-dflnrv ] +.RI { directory | device }... +.sp +.B umount +.BR \-h | \-V + +.SH DESCRIPTION +The +.B umount +command detaches the mentioned file system(s) from the file hierarchy. A +file system is specified by giving the directory where it has been +mounted. Giving the special device on which the file system lives may +also work, but is obsolete, mainly because it will fail in case this +device was mounted on more than one directory. +.PP +Note that a file system cannot be unmounted when it is 'busy' - for +example, when there are open files on it, or when some process has its +working directory there, or when a swap file on it is in use. The +offending process could even be +.B umount +itself - it opens libc, and libc in its turn may open for example locale +files. A lazy unmount avoids this problem, but it may introduce another +issues. See \fB\-\-lazy\fR description below. +.SH OPTIONS +.TP +.BR \-a , " \-\-all" +All of the filesystems described in +.I /proc/self/mountinfo +(or in deprecated /etc/mtab) +are unmounted, except the proc, devfs, devpts, sysfs, rpc_pipefs and nfsd +filesystems. This list of the filesystems may be replaced by \fB\-\-types\fR +umount option. +.TP +.BR \-A , " \-\-all\-targets" +Unmount all mountpoints in the current namespace for the specified filesystem. +The filesystem can be specified by one of the mountpoints or the device name (or +UUID, etc.). When this option is used together with \fB\-\-recursive\fR, then +all nested mounts within the filesystem are recursively unmounted. +This option is only supported on systems where /etc/mtab is a symlink +to /proc/mounts. +.TP +.BR \-c , " \-\-no\-canonicalize" +Do not canonicalize paths. The paths canonicalization is based on +.BR stat (2) +and +.BR readlink (2) +system calls. These system calls may hang in some cases (for example on NFS if +server is not available). The option has to be used with canonical path to the +mount point. + +For more details about this option see the +.BR mount (8) +man page. Note that \fBumount\fR does not pass this option to the +.BI /sbin/umount. type +helpers. +.TP +.BR \-d , " \-\-detach\-loop" +When the unmounted device was a loop device, also free this loop +device. This option is unnecessary for devices initialized by +.BR mount (8), +in this case "autoclear" functionality is enabled by default. +.TP +.B \-\-fake +Causes everything to be done except for the actual system call or umount helper +execution; this 'fakes' unmounting the filesystem. It can be used to remove +entries from the deprecated +.I /etc/mtab +that were unmounted earlier with the +.B \-n +option. +.TP +.BR \-f , " \-\-force" +Force an unmount (in case of an unreachable NFS system). + +Note that this option does not guarantee that umount command does not hang. +It's strongly recommended to use absolute paths without symlinks to avoid +unwanted readlink and stat system calls on unreachable NFS in umount. +.TP +.BR \-i , " \-\-internal\-only" +Do not call the \fB/sbin/umount.\fIfilesystem\fR helper even if it exists. +By default such a helper program is called if it exists. +.TP +.BR \-l , " \-\-lazy" +Lazy unmount. Detach the filesystem from the file hierarchy now, +and clean up all references to this filesystem as soon as it is not busy +anymore. + +A system reboot would be expected in near future if you're going to use this +option for network filesystem or local filesystem with submounts. The +recommended use-case for \fBumount -l\fR is to prevent hangs on shutdown due to +an unreachable network share where a normal umount will hang due to a downed +server or a network partition. Remounts of the share will not be possible. + +.TP +.BR \-N , " \-\-namespace " \fIns +Perform umount in namespace specified by \fIns\fR. +\fIns\fR is either PID of process running in that namespace +or special file representing that namespace. +.sp +.BR umount (8) +switches to the namespace when it reads /etc/fstab, writes /etc/mtab (or writes to /run/mount) and calls +.BR umount (2) +system call, otherwise it runs in the original namespace. It means that the target namespace does not have +to contain any libraries or another requirements necessary to execute +.BR umount (2) +command. +.sp +See \fBnamespaces\fR(7) for more information. +.TP +.BR \-n , " \-\-no\-mtab" +Unmount without writing in +.IR /etc/mtab . +.TP +.BR \-O , " \-\-test\-opts " \fIoption\fR... +Unmount only the filesystems that have the specified option set in +.IR /etc/fstab . +More than one option may be specified in a comma-separated list. +Each option can be prefixed with +.B no +to indicate that no action should be taken for this option. +.TP +.BR \-q , " \-\-quiet" +Suppress "not mounted" error messages. +.TP +.BR \-R , " \-\-recursive" +Recursively unmount each specified directory. Recursion for each directory will +stop if any unmount operation in the chain fails for any reason. The relationship +between mountpoints is determined by /proc/self/mountinfo entries. The filesystem +must be specified by mountpoint path; a recursive unmount by device name (or UUID) +is unsupported. +.TP +.BR \-r , " \-\-read\-only" +When an unmount fails, try to remount the filesystem read-only. +.TP +.BR \-t , " \-\-types " \fItype\fR... +Indicate that the actions should only be taken on filesystems of the +specified +.IR type . +More than one type may be specified in a comma-separated list. The list +of filesystem types can be prefixed with +.B no +to indicate that no action should be taken for all of the mentioned types. +Note that +.B umount +reads information about mounted filesystems from kernel (/proc/mounts) and +filesystem names may be different than filesystem names used in the /etc/fstab +(e.g. "nfs4" vs. "nfs"). +.TP +.BR \-v , " \-\-verbose" +Verbose mode. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH "LOOP DEVICE" +The +.B umount +command will automatically detach loop device previously initialized by +.BR mount (8) +command independently of /etc/mtab. + +In this case the device is initialized with "autoclear" flag (see +.BR losetup (8) +output for more details), otherwise it's necessary to use the option \fB \-\-detach\-loop\fR +or call \fBlosetup -d <device>\fR. The autoclear feature is supported since Linux 2.6.25. +.SH EXTERNAL HELPERS +The syntax of external unmount helpers is: +.PP +.RS +.BI umount. suffix +.RI { directory | device } +.RB [ \-flnrv ] +.RB [ \-N +.IR namespace ] +.RB [ \-t +.IR type . subtype ] +.RE +.PP +where \fIsuffix\fR is the filesystem type (or the value from a +\fBuhelper=\fR or \fBhelper=\fR marker in the mtab file). +The \fB\-t\fR option can be used for filesystems that +have subtype support. For example: +.PP +.RS +.B umount.fuse \-t fuse.sshfs +.RE +.PP +A \fBuhelper=\fIsomething\fR marker (unprivileged helper) can appear in +the \fI/etc/mtab\fR file when ordinary users need to be able to unmount +a mountpoint that is not defined in \fI/etc/fstab\fR +(for example for a device that was mounted by \fBudisks\fR(1)). +.PP +A \fBhelper=\fItype\fR marker in the mtab file will redirect +all unmount requests +to the \fB/sbin/umount.\fItype\fR helper independently of UID. +.PP +Note that \fI/etc/mtab\fR is currently deprecated and helper= and another +userspace mount options are maintained by libmount. +.SH FILES +.TP +.I /etc/mtab +table of mounted filesystems (deprecated and usually replaced by +symlink to /proc/mounts) +.TP +.I /etc/fstab +table of known filesystems +.TP +.I /proc/self/mountinfo +table of mounted filesystems generated by kernel. +.SH ENVIRONMENT +.IP LIBMOUNT_FSTAB=<path> +overrides the default location of the fstab file (ignored for suid) +.IP LIBMOUNT_MTAB=<path> +overrides the default location of the mtab file (ignored for suid) +.IP LIBMOUNT_DEBUG=all +enables libmount debug output +.SH "SEE ALSO" +.BR umount (2), +.BR losetup (8), +.BR mount (8) +.SH HISTORY +A +.B umount +command appeared in Version 6 AT&T UNIX. +.SH AVAILABILITY +The umount command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/umount.c b/sys-utils/umount.c new file mode 100644 index 0000000..b021088 --- /dev/null +++ b/sys-utils/umount.c @@ -0,0 +1,610 @@ +/* + * umount(8) -- mount a filesystem + * + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Written by Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <getopt.h> +#include <unistd.h> +#include <sys/types.h> + +#include <libmount.h> + +#include "nls.h" +#include "c.h" +#include "env.h" +#include "closestream.h" +#include "pathnames.h" +#include "canonicalize.h" + +#define XALLOC_EXIT_CODE MNT_EX_SYSERR +#include "xalloc.h" + +#define OPTUTILS_EXIT_CODE MNT_EX_USAGE +#include "optutils.h" + +static int quiet; + +static int table_parser_errcb(struct libmnt_table *tb __attribute__((__unused__)), + const char *filename, int line) +{ + if (filename) + warnx(_("%s: parse error at line %d -- ignored"), filename, line); + return 1; +} + + +static void __attribute__((__noreturn__)) print_version(void) +{ + const char *ver = NULL; + const char **features = NULL, **p; + + mnt_get_library_version(&ver); + mnt_get_library_features(&features); + + printf(_("%s from %s (libmount %s"), + program_invocation_short_name, + PACKAGE_STRING, + ver); + p = features; + while (p && *p) { + fputs(p == features ? ": " : ", ", stdout); + fputs(*p++, stdout); + } + fputs(")\n", stdout); + exit(MNT_EX_SUCCESS); +} +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + fputs(USAGE_HEADER, out); + fprintf(out, _( + " %1$s [-hV]\n" + " %1$s -a [options]\n" + " %1$s [options] <source> | <directory>\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Unmount filesystems.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --all unmount all filesystems\n"), out); + fputs(_(" -A, --all-targets unmount all mountpoints for the given device in the\n" + " current namespace\n"), out); + fputs(_(" -c, --no-canonicalize don't canonicalize paths\n"), out); + fputs(_(" -d, --detach-loop if mounted loop device, also free this loop device\n"), out); + fputs(_(" --fake dry run; skip the umount(2) syscall\n"), out); + fputs(_(" -f, --force force unmount (in case of an unreachable NFS system)\n"), out); + fputs(_(" -i, --internal-only don't call the umount.<type> helpers\n"), out); + fputs(_(" -n, --no-mtab don't write to /etc/mtab\n"), out); + fputs(_(" -l, --lazy detach the filesystem now, clean up things later\n"), out); + fputs(_(" -O, --test-opts <list> limit the set of filesystems (use with -a)\n"), out); + fputs(_(" -R, --recursive recursively unmount a target with all its children\n"), out); + fputs(_(" -r, --read-only in case unmounting fails, try to remount read-only\n"), out); + fputs(_(" -t, --types <list> limit the set of filesystem types\n"), out); + fputs(_(" -v, --verbose say what is being done\n"), out); + fputs(_(" -q, --quiet suppress 'not mounted' error messages\n"), out); + fputs(_(" -N, --namespace <ns> perform umount in another namespace\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(25)); + printf(USAGE_MAN_TAIL("umount(8)")); + + exit(MNT_EX_SUCCESS); +} + +static void __attribute__((__noreturn__)) exit_non_root(const char *option) +{ + const uid_t ruid = getuid(); + const uid_t euid = geteuid(); + + if (ruid == 0 && euid != 0) { + /* user is root, but setuid to non-root */ + if (option) + errx(MNT_EX_USAGE, + _("only root can use \"--%s\" option " + "(effective UID is %u)"), + option, euid); + errx(MNT_EX_USAGE, _("only root can do that " + "(effective UID is %u)"), euid); + } + if (option) + errx(MNT_EX_USAGE, _("only root can use \"--%s\" option"), option); + errx(MNT_EX_USAGE, _("only root can do that")); +} + +static void success_message(struct libmnt_context *cxt) +{ + const char *tgt, *src; + + if (mnt_context_helper_executed(cxt) + || mnt_context_get_status(cxt) != 1) + return; + + tgt = mnt_context_get_target(cxt); + if (!tgt) + return; + + src = mnt_context_get_source(cxt); + if (src) + warnx(_("%s (%s) unmounted"), tgt, src); + else + warnx(_("%s unmounted"), tgt); +} + +static int mk_exit_code(struct libmnt_context *cxt, int rc) +{ + char buf[BUFSIZ] = { 0 }; + + rc = mnt_context_get_excode(cxt, rc, buf, sizeof(buf)); + + /* suppress "not mounted" error message */ + if (quiet && + rc == MNT_EX_FAIL && + mnt_context_syscall_called(cxt) && + mnt_context_get_syscall_errno(cxt) == EINVAL) + return rc; + + /* print errors/warnings */ + if (*buf) { + const char *spec = mnt_context_get_target(cxt); + if (!spec) + spec = mnt_context_get_source(cxt); + if (!spec) + spec = "???"; + warnx("%s: %s.", spec, buf); + } + return rc; +} + +static int umount_all(struct libmnt_context *cxt) +{ + struct libmnt_iter *itr; + struct libmnt_fs *fs; + int mntrc, ignored, rc = 0; + + itr = mnt_new_iter(MNT_ITER_BACKWARD); + if (!itr) { + warn(_("failed to initialize libmount iterator")); + return MNT_EX_SYSERR; + } + + while (mnt_context_next_umount(cxt, itr, &fs, &mntrc, &ignored) == 0) { + + const char *tgt = mnt_fs_get_target(fs); + + if (ignored) { + if (mnt_context_is_verbose(cxt)) + printf(_("%-25s: ignored\n"), tgt); + } else { + int xrc = mk_exit_code(cxt, mntrc); + + if (xrc == MNT_EX_SUCCESS + && mnt_context_is_verbose(cxt)) + printf("%-25s: successfully unmounted\n", tgt); + rc |= xrc; + } + } + + mnt_free_iter(itr); + return rc; +} + +static int umount_one(struct libmnt_context *cxt, const char *spec) +{ + int rc; + + if (!spec) + return MNT_EX_SOFTWARE; + + if (mnt_context_set_target(cxt, spec)) + err(MNT_EX_SYSERR, _("failed to set umount target")); + + rc = mnt_context_umount(cxt); + rc = mk_exit_code(cxt, rc); + + if (rc == MNT_EX_SUCCESS && mnt_context_is_verbose(cxt)) + success_message(cxt); + + mnt_reset_context(cxt); + return rc; +} + +static struct libmnt_table *new_mountinfo(struct libmnt_context *cxt) +{ + struct libmnt_table *tb; + struct libmnt_ns *ns_old = mnt_context_switch_target_ns(cxt); + + if (!ns_old) + err(MNT_EX_SYSERR, _("failed to switch namespace")); + + tb = mnt_new_table(); + if (!tb) + err(MNT_EX_SYSERR, _("libmount table allocation failed")); + + mnt_table_set_parser_errcb(tb, table_parser_errcb); + mnt_table_set_cache(tb, mnt_context_get_cache(cxt)); + + if (mnt_table_parse_file(tb, _PATH_PROC_MOUNTINFO)) { + warn(_("failed to parse %s"), _PATH_PROC_MOUNTINFO); + mnt_unref_table(tb); + tb = NULL; + } + + if (!mnt_context_switch_ns(cxt, ns_old)) + err(MNT_EX_SYSERR, _("failed to switch namespace")); + + return tb; +} + +/* + * like umount_one() but does not return error is @spec not mounted + */ +static int umount_one_if_mounted(struct libmnt_context *cxt, const char *spec) +{ + int rc; + struct libmnt_fs *fs; + + rc = mnt_context_find_umount_fs(cxt, spec, &fs); + if (rc == 1) { + rc = MNT_EX_SUCCESS; /* already unmounted */ + mnt_reset_context(cxt); + } else if (rc < 0) { + rc = mk_exit_code(cxt, rc); /* error */ + mnt_reset_context(cxt); + } else + rc = umount_one(cxt, mnt_fs_get_target(fs)); + + return rc; +} + +static int umount_do_recurse(struct libmnt_context *cxt, + struct libmnt_table *tb, struct libmnt_fs *fs) +{ + struct libmnt_fs *child; + struct libmnt_iter *itr = mnt_new_iter(MNT_ITER_BACKWARD); + int rc; + + if (!itr) + err(MNT_EX_SYSERR, _("libmount iterator allocation failed")); + + /* umount all children */ + for (;;) { + rc = mnt_table_next_child_fs(tb, itr, fs, &child); + if (rc < 0) { + warnx(_("failed to get child fs of %s"), + mnt_fs_get_target(fs)); + rc = MNT_EX_SOFTWARE; + goto done; + } else if (rc == 1) + break; /* no more children */ + + rc = umount_do_recurse(cxt, tb, child); + if (rc != MNT_EX_SUCCESS) + goto done; + } + + rc = umount_one_if_mounted(cxt, mnt_fs_get_target(fs)); +done: + mnt_free_iter(itr); + return rc; +} + +static int umount_recursive(struct libmnt_context *cxt, const char *spec) +{ + struct libmnt_table *tb; + struct libmnt_fs *fs; + int rc; + + tb = new_mountinfo(cxt); + if (!tb) + return MNT_EX_SOFTWARE; + + /* it's always real mountpoint, don't assume that the target maybe a device */ + mnt_context_disable_swapmatch(cxt, 1); + + fs = mnt_table_find_target(tb, spec, MNT_ITER_BACKWARD); + if (fs) + rc = umount_do_recurse(cxt, tb, fs); + else { + rc = MNT_EX_USAGE; + if (!quiet) + warnx(access(spec, F_OK) == 0 ? + _("%s: not mounted") : + _("%s: not found"), spec); + } + + mnt_unref_table(tb); + return rc; +} + +static int umount_alltargets(struct libmnt_context *cxt, const char *spec, int rec) +{ + struct libmnt_fs *fs; + struct libmnt_table *tb; + struct libmnt_iter *itr = NULL; + dev_t devno = 0; + int rc; + + /* Convert @spec to device name, Use the same logic like regular + * "umount <spec>". + */ + rc = mnt_context_find_umount_fs(cxt, spec, &fs); + if (rc == 1) { + rc = MNT_EX_USAGE; + if (!quiet) + warnx(access(spec, F_OK) == 0 ? + _("%s: not mounted") : + _("%s: not found"), spec); + return rc; + } + if (rc < 0) + return mk_exit_code(cxt, rc); /* error */ + + if (!mnt_fs_get_srcpath(fs) || !mnt_fs_get_devno(fs)) + errx(MNT_EX_USAGE, _("%s: failed to determine source " + "(--all-targets is unsupported on systems with " + "regular mtab file)."), spec); + + itr = mnt_new_iter(MNT_ITER_BACKWARD); + if (!itr) + err(MNT_EX_SYSERR, _("libmount iterator allocation failed")); + + /* get on @cxt independent mountinfo */ + tb = new_mountinfo(cxt); + if (!tb) { + rc = MNT_EX_SOFTWARE; + goto done; + } + + /* Note that @fs is from mount context and the context will be reset + * after each umount() call */ + devno = mnt_fs_get_devno(fs); + fs = NULL; + + mnt_reset_context(cxt); + + while (mnt_table_next_fs(tb, itr, &fs) == 0) { + if (mnt_fs_get_devno(fs) != devno) + continue; + mnt_context_disable_swapmatch(cxt, 1); + if (rec) + rc = umount_do_recurse(cxt, tb, fs); + else + rc = umount_one_if_mounted(cxt, mnt_fs_get_target(fs)); + + if (rc != MNT_EX_SUCCESS) + break; + } + +done: + mnt_free_iter(itr); + mnt_unref_table(tb); + + return rc; +} + +/* + * Check path -- non-root user should not be able to resolve path which is + * unreadable for him. + */ +static char *sanitize_path(const char *path) +{ + char *p; + + if (!path) + return NULL; + + p = canonicalize_path_restricted(path); + if (!p) + err(MNT_EX_USAGE, "%s", path); + + return p; +} + +static pid_t parse_pid(const char *str) +{ + char *end; + pid_t ret; + + errno = 0; + ret = strtoul(str, &end, 10); + + if (ret < 0 || errno || end == str || (end && *end)) + return 0; + return ret; +} + +int main(int argc, char **argv) +{ + int c, rc = 0, all = 0, recursive = 0, alltargets = 0; + struct libmnt_context *cxt; + char *types = NULL; + + enum { + UMOUNT_OPT_FAKE = CHAR_MAX + 1, + }; + + static const struct option longopts[] = { + { "all", no_argument, NULL, 'a' }, + { "all-targets", no_argument, NULL, 'A' }, + { "detach-loop", no_argument, NULL, 'd' }, + { "fake", no_argument, NULL, UMOUNT_OPT_FAKE }, + { "force", no_argument, NULL, 'f' }, + { "help", no_argument, NULL, 'h' }, + { "internal-only", no_argument, NULL, 'i' }, + { "lazy", no_argument, NULL, 'l' }, + { "no-canonicalize", no_argument, NULL, 'c' }, + { "no-mtab", no_argument, NULL, 'n' }, + { "quiet", no_argument, NULL, 'q' }, + { "read-only", no_argument, NULL, 'r' }, + { "recursive", no_argument, NULL, 'R' }, + { "test-opts", required_argument, NULL, 'O' }, + { "types", required_argument, NULL, 't' }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + { "namespace", required_argument, NULL, 'N' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'A','a' }, /* all-targets,all */ + { 'R','a' }, /* recursive,all */ + { 'O','R','t'}, /* options,recursive,types */ + { 'R','r' }, /* recursive,read-only */ + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + sanitize_env(); + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + mnt_init_debug(0); + cxt = mnt_new_context(); + if (!cxt) + err(MNT_EX_SYSERR, _("libmount context allocation failed")); + + mnt_context_set_tables_errcb(cxt, table_parser_errcb); + + while ((c = getopt_long(argc, argv, "aAcdfhilnqRrO:t:vVN:", + longopts, NULL)) != -1) { + + + /* only few options are allowed for non-root users */ + if (mnt_context_is_restricted(cxt) && !strchr("hdilqVv", c)) + exit_non_root(option_to_longopt(c, longopts)); + + err_exclusive_options(c, longopts, excl, excl_st); + + switch(c) { + case 'a': + all = 1; + break; + case 'A': + alltargets = 1; + break; + case 'c': + mnt_context_disable_canonicalize(cxt, TRUE); + break; + case 'd': + mnt_context_enable_loopdel(cxt, TRUE); + break; + case UMOUNT_OPT_FAKE: + mnt_context_enable_fake(cxt, TRUE); + break; + case 'f': + mnt_context_enable_force(cxt, TRUE); + break; + case 'h': + usage(); + break; + case 'i': + mnt_context_disable_helpers(cxt, TRUE); + break; + case 'l': + mnt_context_enable_lazy(cxt, TRUE); + break; + case 'n': + mnt_context_disable_mtab(cxt, TRUE); + break; + case 'q': + quiet = 1; + break; + case 'r': + mnt_context_enable_rdonly_umount(cxt, TRUE); + break; + case 'R': + recursive = TRUE; + break; + case 'O': + if (mnt_context_set_options_pattern(cxt, optarg)) + err(MNT_EX_SYSERR, _("failed to set options pattern")); + break; + case 't': + types = optarg; + break; + case 'v': + mnt_context_enable_verbose(cxt, TRUE); + break; + case 'V': + print_version(); + break; + case 'N': + { + char path[PATH_MAX]; + pid_t pid = parse_pid(optarg); + + if (pid) + snprintf(path, sizeof(path), "/proc/%i/ns/mnt", pid); + + if (mnt_context_set_target_ns(cxt, pid ? path : optarg)) + err(MNT_EX_SYSERR, _("failed to set target namespace to %s"), pid ? path : optarg); + break; + } + default: + errtryhelp(MNT_EX_USAGE); + } + } + + argc -= optind; + argv += optind; + + if (all) { + if (!types) + types = "noproc,nodevfs,nodevpts,nosysfs,norpc_pipefs,nonfsd,noselinuxfs"; + + mnt_context_set_fstype_pattern(cxt, types); + rc = umount_all(cxt); + + } else if (argc < 1) { + warnx(_("bad usage")); + errtryhelp(MNT_EX_USAGE); + + } else if (alltargets) { + while (argc--) + rc += umount_alltargets(cxt, *argv++, recursive); + } else if (recursive) { + while (argc--) + rc += umount_recursive(cxt, *argv++); + } else { + while (argc--) { + char *path = *argv; + + if (mnt_context_is_restricted(cxt) + && !mnt_tag_is_valid(path)) + path = sanitize_path(path); + + rc += umount_one(cxt, path); + + if (path != *argv) + free(path); + argv++; + } + } + + mnt_free_context(cxt); + return (rc < 256) ? rc : 255; +} + diff --git a/sys-utils/unshare.1 b/sys-utils/unshare.1 new file mode 100644 index 0000000..746c411 --- /dev/null +++ b/sys-utils/unshare.1 @@ -0,0 +1,266 @@ +.TH UNSHARE 1 "February 2016" "util-linux" "User Commands" +.SH NAME +unshare \- run program with some namespaces unshared from parent +.SH SYNOPSIS +.B unshare +[options] +.RI [ program +.RI [ arguments ]] +.SH DESCRIPTION +Unshares the indicated namespaces from the parent process and then executes +the specified \fIprogram\fR. If \fIprogram\fR is not given, then ``${SHELL}'' is +run (default: /bin/sh). +.PP +The namespaces can optionally be made persistent by bind mounting +/proc/\fIpid\fR/ns/\fItype\fR files to a filesystem path and entered with +.BR \%nsenter (1) +even after the \fIprogram\fR terminates (except PID namespaces where +permanently running init process is required). +Once a persistent \%namespace is no longer needed, it can be unpersisted with +.BR umount (8). +See the \fBEXAMPLES\fR section for more details. +.PP +The namespaces to be unshared are indicated via options. Unshareable namespaces are: +.TP +.B mount namespace +Mounting and unmounting filesystems will not affect the rest of the system, +except for filesystems which are explicitly marked as +shared (with \fBmount --make-shared\fP; see \fI/proc/self/mountinfo\fP or +\fBfindmnt -o+PROPAGATION\fP for the \fBshared\fP flags). +For further details, see +.BR mount_namespaces (7) +and the discussion of the +.B CLONE_NEWNS +flag in +.BR clone (2). +.sp +.B unshare +since util-linux version 2.27 automatically sets propagation to \fBprivate\fP +in a new mount namespace to make sure that the new namespace is really +unshared. It's possible to disable this feature with option +\fB\-\-propagation unchanged\fP. +Note that \fBprivate\fP is the kernel default. +.TP +.B UTS namespace +Setting hostname or domainname will not affect the rest of the system. +For further details, see +.BR namespaces (7) +and the discussion of the +.B CLONE_NEWUTS +flag in +.BR clone (2). +.TP +.B IPC namespace +The process will have an independent namespace for POSIX message queues +as well as System V \%message queues, +semaphore sets and shared memory segments. +For further details, see +.BR namespaces (7) +and the discussion of the +.B CLONE_NEWIPC +flag in +.BR clone (2). +.TP +.B network namespace +The process will have independent IPv4 and IPv6 stacks, IP routing tables, +firewall rules, the \fI/proc/net\fP and \fI/sys/class/net\fP directory trees, +sockets, etc. +For further details, see +.BR namespaces (7) +and the discussion of the +.B CLONE_NEWNET +flag in +.BR clone (2). +.TP +.B PID namespace +Children will have a distinct set of PID-to-process mappings from their parent. +For further details, see +.BR pid_namespaces (7) +and +the discussion of the +.B CLONE_NEWPID +flag in +.BR clone (2). +.TP +.B cgroup namespace +The process will have a virtualized view of \fI/proc\:/self\:/cgroup\fP, and new +cgroup mounts will be rooted at the namespace cgroup root. +For further details, see +.BR cgroup_namespaces (7) +and the discussion of the +.B CLONE_NEWCGROUP +flag in +.BR clone (2). +.TP +.B user namespace +The process will have a distinct set of UIDs, GIDs and capabilities. +For further details, see +.BR user_namespaces (7) +and the discussion of the +.B CLONE_NEWUSER +flag in +.BR clone (2). +.SH OPTIONS +.TP +.BR \-i , " \-\-ipc" [ =\fIfile ] +Unshare the IPC namespace. If \fIfile\fP is specified, then a persistent +namespace is created by a bind mount. +.TP +.BR \-m , " \-\-mount" [ =\fIfile ] +Unshare the mount namespace. If \fIfile\fP is specified, then a persistent +namespace is created by a bind mount. +Note that \fIfile\fP has to be located on a filesystem with the propagation +flag set to \fBprivate\fP. Use the command \fBfindmnt -o+PROPAGATION\fP +when not sure about the current setting. See also the examples below. +.TP +.BR \-n , " \-\-net" [ =\fIfile ] +Unshare the network namespace. If \fIfile\fP is specified, then a persistent +namespace is created by a bind mount. +.TP +.BR \-p , " \-\-pid" [ =\fIfile ] +Unshare the PID namespace. If \fIfile\fP is specified then persistent +namespace is created by a bind mount. See also the \fB--fork\fP and +\fB--mount-proc\fP options. +.TP +.BR \-u , " \-\-uts" [ =\fIfile ] +Unshare the UTS namespace. If \fIfile\fP is specified, then a persistent +namespace is created by a bind mount. +.TP +.BR \-U , " \-\-user" [ =\fIfile ] +Unshare the user namespace. If \fIfile\fP is specified, then a persistent +namespace is created by a bind mount. +.TP +.BR \-C , " \-\-cgroup"[=\fIfile\fP] +Unshare the cgroup namespace. If \fIfile\fP is specified then persistent namespace is created +by bind mount. +.TP +.BR \-f , " \-\-fork" +Fork the specified \fIprogram\fR as a child process of \fBunshare\fR rather than +running it directly. This is useful when creating a new PID namespace. +.TP +.BR \-\-kill\-child [ =\fIsigname ] +When \fBunshare\fR terminates, have \fIsigname\fP be sent to the forked child process. +Combined with \fB--pid\fR this allows for an easy and reliable killing of the entire +process tree below \fBunshare\fR. +If not given, \fIsigname\fP defaults to \fBSIGKILL\fR. +This option implies \fB--fork\fR. +.TP +.BR \-\-mount\-proc [ =\fImountpoint ] +Just before running the program, mount the proc filesystem at \fImountpoint\fP +(default is /proc). This is useful when creating a new PID namespace. It also +implies creating a new mount namespace since the /proc mount would otherwise +mess up existing programs on the system. The new proc filesystem is explicitly +mounted as private (with MS_PRIVATE|MS_REC). +.TP +.BR \-r , " \-\-map\-root\-user" +Run the program only after the current effective user and group IDs have been mapped to +the superuser UID and GID in the newly created user namespace. This makes it possible to +conveniently gain capabilities needed to manage various aspects of the newly created +namespaces (such as configuring interfaces in the network namespace or mounting filesystems in +the mount namespace) even when run unprivileged. As a mere convenience feature, it does not support +more sophisticated use cases, such as mapping multiple ranges of UIDs and GIDs. +This option implies \fB--setgroups=deny\fR. +.TP +.BR "\-\-propagation private" | shared | slave | unchanged +Recursively set the mount propagation flag in the new mount namespace. The default +is to set the propagation to \fIprivate\fP. It is possible to disable this feature +with the argument \fBunchanged\fR. The option is silently ignored when the mount +namespace (\fB\-\-mount\fP) is not requested. +.TP +.BR "\-\-setgroups allow" | deny +Allow or deny the +.BR setgroups (2) +system call in a user namespace. +.sp +To be able to call +.BR setgroups (2), +the calling process must at least have CAP_SETGID. +But since Linux 3.19 a further restriction applies: +the kernel gives permission to call +.BR \%setgroups (2) +only after the GID map (\fB/proc/\fIpid\fB/gid_map\fR) has been set. +The GID map is writable by root when +.BR \%setgroups (2) +is enabled (i.e. \fBallow\fR, the default), and +the GID map becomes writable by unprivileged processes when +.BR \%setgroups (2) +is permanently disabled (with \fBdeny\fR). +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH NOTES +The proc and sysfs filesystems mounting as root in a user namespace have to be +restricted so that a less privileged user can not get more access to sensitive +files that a more privileged user made unavailable. In short the rule for proc +and sysfs is as close to a bind mount as possible. +.SH EXAMPLES +.TP +.B # unshare --fork --pid --mount-proc readlink /proc/self +.TQ +1 +.br +Establish a PID namespace, ensure we're PID 1 in it against a newly mounted +procfs instance. +.TP +.B $ unshare --map-root-user --user sh -c whoami +.TQ +root +.br +Establish a user namespace as an unprivileged user with a root user within it. +.TP +.B # touch /root/uts-ns +.TQ +.B # unshare --uts=/root/uts-ns hostname FOO +.TQ +.B # nsenter --uts=/root/uts-ns hostname +.TQ +FOO +.TQ +.B # umount /root/uts-ns +.br +Establish a persistent UTS namespace, and modify the hostname. The namespace +is then entered with \fBnsenter\fR. The namespace is destroyed by unmounting +the bind reference. +.TP +.B # mount --bind /root/namespaces /root/namespaces +.TQ +.B # mount --make-private /root/namespaces +.TQ +.B # touch /root/namespaces/mnt +.TQ +.B # unshare --mount=/root/namespaces/mnt +.br +Establish a persistent mount namespace referenced by the bind mount +/root/namespaces/mnt. This example shows a portable solution, because it +makes sure that the bind mount is created on a shared filesystem. +.TP +.B # unshare -pf --kill-child -- bash -c "(sleep 999 &) && sleep 1000" & +.TQ +.B # pid=$! +.TQ +.B # kill $pid +.br +Reliable killing of subprocesses of the \fIprogram\fR. +When \fBunshare\fR gets killed, everything below it gets killed as well. +Without it, the children of \fIprogram\fR would have orphaned and +been re-parented to PID 1. + +.SH SEE ALSO +.BR clone (2), +.BR unshare (2), +.BR namespaces (7), +.BR mount (8) +.SH AUTHORS +.UR dottedmag@dottedmag.net +Mikhail Gusarov +.UE +.br +.UR kzak@redhat.com +Karel Zak +.UE +.SH AVAILABILITY +The unshare command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/unshare.c b/sys-utils/unshare.c new file mode 100644 index 0000000..661665a --- /dev/null +++ b/sys-utils/unshare.c @@ -0,0 +1,484 @@ +/* + * unshare(1) - command-line interface for unshare(2) + * + * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <errno.h> +#include <getopt.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/wait.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/prctl.h> + +/* we only need some defines missing in sys/mount.h, no libmount linkage */ +#include <libmount.h> + +#include "nls.h" +#include "c.h" +#include "closestream.h" +#include "namespace.h" +#include "exec_shell.h" +#include "xalloc.h" +#include "pathnames.h" +#include "all-io.h" +#include "signames.h" + +/* synchronize parent and child by pipe */ +#define PIPE_SYNC_BYTE 0x06 + +/* 'private' is kernel default */ +#define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE) + +/* /proc namespace files and mountpoints for binds */ +static struct namespace_file { + int type; /* CLONE_NEW* */ + const char *name; /* ns/<type> */ + const char *target; /* user specified target for bind mount */ +} namespace_files[] = { + { .type = CLONE_NEWUSER, .name = "ns/user" }, + { .type = CLONE_NEWCGROUP,.name = "ns/cgroup" }, + { .type = CLONE_NEWIPC, .name = "ns/ipc" }, + { .type = CLONE_NEWUTS, .name = "ns/uts" }, + { .type = CLONE_NEWNET, .name = "ns/net" }, + { .type = CLONE_NEWPID, .name = "ns/pid" }, + { .type = CLONE_NEWNS, .name = "ns/mnt" }, + { .name = NULL } +}; + +static int npersists; /* number of persistent namespaces */ + + +enum { + SETGROUPS_NONE = -1, + SETGROUPS_DENY = 0, + SETGROUPS_ALLOW = 1, +}; + +static const char *setgroups_strings[] = +{ + [SETGROUPS_DENY] = "deny", + [SETGROUPS_ALLOW] = "allow" +}; + +static int setgroups_str2id(const char *str) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++) + if (strcmp(str, setgroups_strings[i]) == 0) + return i; + + errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str); +} + +static void setgroups_control(int action) +{ + const char *file = _PATH_PROC_SETGROUPS; + const char *cmd; + int fd; + + if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings)) + return; + cmd = setgroups_strings[action]; + + fd = open(file, O_WRONLY); + if (fd < 0) { + if (errno == ENOENT) + return; + err(EXIT_FAILURE, _("cannot open %s"), file); + } + + if (write_all(fd, cmd, strlen(cmd))) + err(EXIT_FAILURE, _("write failed %s"), file); + close(fd); +} + +static void map_id(const char *file, uint32_t from, uint32_t to) +{ + char *buf; + int fd; + + fd = open(file, O_WRONLY); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), file); + + xasprintf(&buf, "%u %u 1", from, to); + if (write_all(fd, buf, strlen(buf))) + err(EXIT_FAILURE, _("write failed %s"), file); + free(buf); + close(fd); +} + +static unsigned long parse_propagation(const char *str) +{ + size_t i; + static const struct prop_opts { + const char *name; + unsigned long flag; + } opts[] = { + { "slave", MS_REC | MS_SLAVE }, + { "private", MS_REC | MS_PRIVATE }, + { "shared", MS_REC | MS_SHARED }, + { "unchanged", 0 } + }; + + for (i = 0; i < ARRAY_SIZE(opts); i++) { + if (strcmp(opts[i].name, str) == 0) + return opts[i].flag; + } + + errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str); +} + +static void set_propagation(unsigned long flags) +{ + if (flags == 0) + return; + + if (mount("none", "/", NULL, flags, NULL) != 0) + err(EXIT_FAILURE, _("cannot change root filesystem propagation")); +} + + +static int set_ns_target(int type, const char *path) +{ + struct namespace_file *ns; + + for (ns = namespace_files; ns->name; ns++) { + if (ns->type != type) + continue; + ns->target = path; + npersists++; + return 0; + } + + return -EINVAL; +} + +static int bind_ns_files(pid_t pid) +{ + struct namespace_file *ns; + char src[PATH_MAX]; + + for (ns = namespace_files; ns->name; ns++) { + if (!ns->target) + continue; + + snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name); + + if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0) + err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target); + } + + return 0; +} + +static ino_t get_mnt_ino(pid_t pid) +{ + struct stat st; + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid); + + if (stat(path, &st) != 0) + err(EXIT_FAILURE, _("cannot stat %s"), path); + return st.st_ino; +} + +static void bind_ns_files_from_child(pid_t *child, int fds[2]) +{ + char ch; + pid_t ppid = getpid(); + ino_t ino = get_mnt_ino(ppid); + + if (pipe(fds) < 0) + err(EXIT_FAILURE, _("pipe failed")); + + *child = fork(); + + switch (*child) { + case -1: + err(EXIT_FAILURE, _("fork failed")); + + case 0: /* child */ + close(fds[1]); + fds[1] = -1; + + /* wait for parent */ + if (read_all(fds[0], &ch, 1) != 1 && ch != PIPE_SYNC_BYTE) + err(EXIT_FAILURE, _("failed to read pipe")); + if (get_mnt_ino(ppid) == ino) + exit(EXIT_FAILURE); + bind_ns_files(ppid); + exit(EXIT_SUCCESS); + break; + + default: /* parent */ + close(fds[0]); + fds[0] = -1; + break; + } +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Run a program with some namespaces unshared from the parent.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out); + fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out); + fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out); + fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out); + fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out); + fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out); + fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out); + fputs(USAGE_SEPARATOR, out); + fputs(_(" -f, --fork fork before launching <program>\n"), out); + fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out); + fputs(USAGE_SEPARATOR, out); + fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n" + " defaults to SIGKILL\n"), out); + fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out); + fputs(_(" --propagation slave|shared|private|unchanged\n" + " modify mount propagation in mount namespace\n"), out); + fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(27)); + printf(USAGE_MAN_TAIL("unshare(1)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + enum { + OPT_MOUNTPROC = CHAR_MAX + 1, + OPT_PROPAGATION, + OPT_SETGROUPS, + OPT_KILLCHILD + }; + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + + { "mount", optional_argument, NULL, 'm' }, + { "uts", optional_argument, NULL, 'u' }, + { "ipc", optional_argument, NULL, 'i' }, + { "net", optional_argument, NULL, 'n' }, + { "pid", optional_argument, NULL, 'p' }, + { "user", optional_argument, NULL, 'U' }, + { "cgroup", optional_argument, NULL, 'C' }, + + { "fork", no_argument, NULL, 'f' }, + { "kill-child", optional_argument, NULL, OPT_KILLCHILD }, + { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC }, + { "map-root-user", no_argument, NULL, 'r' }, + { "propagation", required_argument, NULL, OPT_PROPAGATION }, + { "setgroups", required_argument, NULL, OPT_SETGROUPS }, + { NULL, 0, NULL, 0 } + }; + + int setgrpcmd = SETGROUPS_NONE; + int unshare_flags = 0; + int c, forkit = 0, maproot = 0; + int kill_child_signo = 0; /* 0 means --kill-child was not used */ + const char *procmnt = NULL; + pid_t pid = 0; + int fds[2]; + int status; + unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT; + uid_t real_euid = geteuid(); + gid_t real_egid = getegid(); + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "+fhVmuinpCUr", longopts, NULL)) != -1) { + switch (c) { + case 'f': + forkit = 1; + break; + case 'h': + usage(); + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'm': + unshare_flags |= CLONE_NEWNS; + if (optarg) + set_ns_target(CLONE_NEWNS, optarg); + break; + case 'u': + unshare_flags |= CLONE_NEWUTS; + if (optarg) + set_ns_target(CLONE_NEWUTS, optarg); + break; + case 'i': + unshare_flags |= CLONE_NEWIPC; + if (optarg) + set_ns_target(CLONE_NEWIPC, optarg); + break; + case 'n': + unshare_flags |= CLONE_NEWNET; + if (optarg) + set_ns_target(CLONE_NEWNET, optarg); + break; + case 'p': + unshare_flags |= CLONE_NEWPID; + if (optarg) + set_ns_target(CLONE_NEWPID, optarg); + break; + case 'U': + unshare_flags |= CLONE_NEWUSER; + if (optarg) + set_ns_target(CLONE_NEWUSER, optarg); + break; + case 'C': + unshare_flags |= CLONE_NEWCGROUP; + if (optarg) + set_ns_target(CLONE_NEWCGROUP, optarg); + break; + case OPT_MOUNTPROC: + unshare_flags |= CLONE_NEWNS; + procmnt = optarg ? optarg : "/proc"; + break; + case 'r': + unshare_flags |= CLONE_NEWUSER; + maproot = 1; + break; + case OPT_SETGROUPS: + setgrpcmd = setgroups_str2id(optarg); + break; + case OPT_PROPAGATION: + propagation = parse_propagation(optarg); + break; + case OPT_KILLCHILD: + forkit = 1; + if (optarg) { + if ((kill_child_signo = signame_to_signum(optarg)) < 0) + errx(EXIT_FAILURE, _("unknown signal: %s"), + optarg); + } else { + kill_child_signo = SIGKILL; + } + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (npersists && (unshare_flags & CLONE_NEWNS)) + bind_ns_files_from_child(&pid, fds); + + if (-1 == unshare(unshare_flags)) + err(EXIT_FAILURE, _("unshare failed")); + + if (npersists) { + if (pid && (unshare_flags & CLONE_NEWNS)) { + int rc; + char ch = PIPE_SYNC_BYTE; + + /* signal child we are ready */ + write_all(fds[1], &ch, 1); + close(fds[1]); + fds[1] = -1; + + /* wait for bind_ns_files_from_child() */ + do { + rc = waitpid(pid, &status, 0); + if (rc < 0) { + if (errno == EINTR) + continue; + err(EXIT_FAILURE, _("waitpid failed")); + } + if (WIFEXITED(status) && + WEXITSTATUS(status) != EXIT_SUCCESS) + return WEXITSTATUS(status); + } while (rc < 0); + } else + /* simple way, just bind */ + bind_ns_files(getpid()); + } + + if (forkit) { + pid = fork(); + + switch(pid) { + case -1: + err(EXIT_FAILURE, _("fork failed")); + case 0: /* child */ + break; + default: /* parent */ + if (waitpid(pid, &status, 0) == -1) + err(EXIT_FAILURE, _("waitpid failed")); + if (WIFEXITED(status)) + return WEXITSTATUS(status); + else if (WIFSIGNALED(status)) + kill(getpid(), WTERMSIG(status)); + err(EXIT_FAILURE, _("child exit failed")); + } + } + + if (kill_child_signo != 0 && prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0) + err(EXIT_FAILURE, "prctl failed"); + + if (maproot) { + if (setgrpcmd == SETGROUPS_ALLOW) + errx(EXIT_FAILURE, _("options --setgroups=allow and " + "--map-root-user are mutually exclusive")); + + /* since Linux 3.19 unprivileged writing of /proc/self/gid_map + * has s been disabled unless /proc/self/setgroups is written + * first to permanently disable the ability to call setgroups + * in that user namespace. */ + setgroups_control(SETGROUPS_DENY); + map_id(_PATH_PROC_UIDMAP, 0, real_euid); + map_id(_PATH_PROC_GIDMAP, 0, real_egid); + + } else if (setgrpcmd != SETGROUPS_NONE) + setgroups_control(setgrpcmd); + + if ((unshare_flags & CLONE_NEWNS) && propagation) + set_propagation(propagation); + + if (procmnt && + (mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL) != 0 || + mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)) + err(EXIT_FAILURE, _("mount %s failed"), procmnt); + + if (optind < argc) { + execvp(argv[optind], argv + optind); + errexec(argv[optind]); + } + exec_shell(); +} diff --git a/sys-utils/wdctl.8 b/sys-utils/wdctl.8 new file mode 100644 index 0000000..7edf808 --- /dev/null +++ b/sys-utils/wdctl.8 @@ -0,0 +1,70 @@ +.\" wdctl.8 -- +.\" Copyright (C) 2012 Karel Zak <kzak@redhat.com> +.\" May be distributed under the GNU General Public License +.TH WDCTL "8" "July 2014" "util-linux" "System Administration" +.SH NAME +wdctl \- show hardware watchdog status +.SH SYNOPSIS +.B wdctl +[options] +.RI [ device ...] +.SH DESCRIPTION +Show hardware watchdog status. The default device is +.IR /dev/watchdog . +If more than one device is specified then the output is separated by +one blank line. +.PP +Note that the number of supported watchdog features is hardware specific. +.SH OPTIONS +.TP +.BR \-f , " \-\-flags " \fIlist +Print only the specified flags. +.TP +.BR \-F , " \-\-noflags" +Do not print information about flags. +.TP +.BR \-I , " \-\-noident" +Do not print watchdog identity information. +.TP +.BR \-n , " \-\-noheadings" +Do not print a header line for flags table. +.IP "\fB\-o\fR, \fB\-\-output \fIlist\fP" +Define the output columns to use in table of watchdog flags. If no +output arrangement is specified, then a default set is used. Use +.B \-\-help +to get list of all supported columns. +.TP +.BR \-O , " \-\-oneline" +Print all wanted information on one line in key="value" output format. +.TP +.BR \-r , " \-\-raw" +Use the raw output format. +.TP +.BR \-s , " \-settimeout " \fIseconds +Set the watchdog timeout in seconds. +.TP +.BR \-T , " \-\-notimeouts" +Do not print watchdog timeouts. +.IP "\fB\-x\fR, \fB\-\-flags\-only\fP" +Same as \fB\-I \-T\fP. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. +.SH AUTHORS +.MT kzak@\:redhat\:.com +Karel Zak +.ME +.br +.MT lennart@\:poettering\:.net +Lennart Poettering +.ME +.SH AVAILABILITY +The +.B wdctl +command is part of the util-linux package and is available from +.UR https://\:www.kernel.org\:/pub\:/linux\:/utils\:/util-linux/ +Linux Kernel Archive +.UE . diff --git a/sys-utils/wdctl.c b/sys-utils/wdctl.c new file mode 100644 index 0000000..642db85 --- /dev/null +++ b/sys-utils/wdctl.c @@ -0,0 +1,618 @@ +/* + * wdctl(8) - show hardware watchdog status + * + * Copyright (C) 2012 Lennart Poettering + * Copyright (C) 2012 Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <sys/ioctl.h> +#include <getopt.h> +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <assert.h> +#include <linux/watchdog.h> + +#include <libsmartcols.h> + +#include "nls.h" +#include "c.h" +#include "xalloc.h" +#include "closestream.h" +#include "optutils.h" +#include "pathnames.h" +#include "strutils.h" +#include "carefulputc.h" + +/* + * since 2.6.18 + */ +#ifndef WDIOC_SETPRETIMEOUT +# define WDIOC_SETPRETIMEOUT _IOWR(WATCHDOG_IOCTL_BASE, 8, int) +# define WDIOC_GETPRETIMEOUT _IOR(WATCHDOG_IOCTL_BASE, 9, int) +# define WDIOC_GETTIMELEFT _IOR(WATCHDOG_IOCTL_BASE, 10, int) +# define WDIOF_POWEROVER 0x0040 /* Power over voltage */ +# define WDIOF_SETTIMEOUT 0x0080 /* Set timeout (in seconds) */ +# define WDIOF_MAGICCLOSE 0x0100 /* Supports magic close char */ +# define WDIOF_PRETIMEOUT 0x0200 /* Pretimeout (in seconds), get/set */ +# define WDIOF_KEEPALIVEPING 0x8000 /* Keep alive ping reply */ +#endif + +/* + * since 3.5 + */ +#ifndef WDIOF_ALARMONLY +# define WDIOF_ALARMONLY 0x0400 /* Watchdog triggers a management or + other external alarm not a reboot */ +#endif + +/* basic output flags */ +static int no_headings; +static int raw; + +struct wdflag { + uint32_t flag; + const char *name; + const char *description; +}; + +static const struct wdflag wdflags[] = { + { WDIOF_CARDRESET, "CARDRESET", N_("Card previously reset the CPU") }, + { WDIOF_EXTERN1, "EXTERN1", N_("External relay 1") }, + { WDIOF_EXTERN2, "EXTERN2", N_("External relay 2") }, + { WDIOF_FANFAULT, "FANFAULT", N_("Fan failed") }, + { WDIOF_KEEPALIVEPING, "KEEPALIVEPING", N_("Keep alive ping reply") }, + { WDIOF_MAGICCLOSE, "MAGICCLOSE", N_("Supports magic close char") }, + { WDIOF_OVERHEAT, "OVERHEAT", N_("Reset due to CPU overheat") }, + { WDIOF_POWEROVER, "POWEROVER", N_("Power over voltage") }, + { WDIOF_POWERUNDER, "POWERUNDER", N_("Power bad/power fault") }, + { WDIOF_PRETIMEOUT, "PRETIMEOUT", N_("Pretimeout (in seconds)") }, + { WDIOF_SETTIMEOUT, "SETTIMEOUT", N_("Set timeout (in seconds)") }, + { WDIOF_ALARMONLY, "ALARMONLY", N_("Not trigger reboot") } +}; + + +/* column names */ +struct colinfo { + const char *name; /* header */ + double whint; /* width hint (N < 1 is in percent of termwidth) */ + int flags; /* SCOLS_FL_* */ + const char *help; +}; + +enum { COL_FLAG, COL_DESC, COL_STATUS, COL_BSTATUS, COL_DEVICE }; + +/* columns descriptions */ +static struct colinfo infos[] = { + [COL_FLAG] = { "FLAG", 14, 0, N_("flag name") }, + [COL_DESC] = { "DESCRIPTION", 0.1, SCOLS_FL_TRUNC, N_("flag description") }, + [COL_STATUS] = { "STATUS", 1, SCOLS_FL_RIGHT, N_("flag status") }, + [COL_BSTATUS] = { "BOOT-STATUS", 1, SCOLS_FL_RIGHT, N_("flag boot status") }, + [COL_DEVICE] = { "DEVICE", 0.1, 0, N_("watchdog device name") } + +}; + +static int columns[ARRAY_SIZE(infos) * 2]; +static int ncolumns; + +struct wdinfo { + char *device; + + int timeout; + int timeleft; + int pretimeout; + + uint32_t status; + uint32_t bstatus; + + struct watchdog_info ident; + + unsigned int has_timeout : 1, + has_timeleft : 1, + has_pretimeout : 1; +}; + +/* converts flag name to flag bit */ +static long name2bit(const char *name, size_t namesz) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(wdflags); i++) { + const char *cn = wdflags[i].name; + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return wdflags[i].flag; + } + warnx(_("unknown flag: %s"), name); + return -1; +} + +static int column2id(const char *name, size_t namesz) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(infos); i++) { + const char *cn = infos[i].name; + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static int get_column_id(int num) +{ + assert(num < ncolumns); + assert(columns[num] < (int) ARRAY_SIZE(infos)); + + return columns[num]; +} + +static struct colinfo *get_column_info(unsigned num) +{ + return &infos[ get_column_id(num) ]; +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, + _(" %s [options] [<device> ...]\n"), program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Show the status of the hardware watchdog.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -f, --flags <list> print selected flags only\n" + " -F, --noflags don't print information about flags\n" + " -I, --noident don't print watchdog identity information\n" + " -n, --noheadings don't print headings for flags table\n" + " -O, --oneline print all information on one line\n" + " -o, --output <list> output columns of the flags\n" + " -r, --raw use raw output format for flags table\n" + " -T, --notimeouts don't print watchdog timeouts\n" + " -s, --settimeout <sec> set watchdog timeout\n" + " -x, --flags-only print only flags table (same as -I -T)\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(24)); + fputs(USAGE_SEPARATOR, out); + + fprintf(out, _("The default device is %s.\n"), _PATH_WATCHDOG_DEV); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(infos); i++) + fprintf(out, " %13s %s\n", infos[i].name, _(infos[i].help)); + + printf(USAGE_MAN_TAIL("wdctl(8)")); + + exit(EXIT_SUCCESS); +} + +static void add_flag_line(struct libscols_table *table, struct wdinfo *wd, const struct wdflag *fl) +{ + int i; + struct libscols_line *line; + + line = scols_table_new_line(table, NULL); + if (!line) { + warn(_("failed to allocate output line")); + return; + } + + for (i = 0; i < ncolumns; i++) { + const char *str = NULL; + + switch (get_column_id(i)) { + case COL_FLAG: + str = fl->name; + break; + case COL_DESC: + str = fl->description; + break; + case COL_STATUS: + str = wd->status & fl->flag ? "1" : "0"; + break; + case COL_BSTATUS: + str = wd->bstatus & fl->flag ? "1" : "0"; + break; + case COL_DEVICE: + str = wd->device; + break; + default: + break; + } + + if (str && scols_line_set_data(line, i, str)) { + warn(_("failed to add output data")); + break; + } + } +} + +static int show_flags(struct wdinfo *wd, uint32_t wanted) +{ + size_t i; + int rc = -1; + struct libscols_table *table; + uint32_t flags; + + scols_init_debug(0); + + /* create output table */ + table = scols_new_table(); + if (!table) { + warn(_("failed to allocate output table")); + return -1; + } + scols_table_enable_raw(table, raw); + scols_table_enable_noheadings(table, no_headings); + + /* define columns */ + for (i = 0; i < (size_t) ncolumns; i++) { + struct colinfo *col = get_column_info(i); + + if (!scols_table_new_column(table, col->name, col->whint, col->flags)) { + warnx(_("failed to allocate output column")); + goto done; + } + } + + /* fill-in table with data + * -- one line for each supported flag (option) */ + flags = wd->ident.options; + + for (i = 0; i < ARRAY_SIZE(wdflags); i++) { + if (wanted && !(wanted & wdflags[i].flag)) + ; /* ignore */ + else if (flags & wdflags[i].flag) + add_flag_line(table, wd, &wdflags[i]); + + flags &= ~wdflags[i].flag; + } + + if (flags) + warnx(_("%s: unknown flags 0x%x\n"), wd->device, flags); + + scols_print_table(table); + rc = 0; +done: + scols_unref_table(table); + return rc; +} +/* + * Warning: successfully opened watchdog has to be properly closed with magic + * close character otherwise the machine will be rebooted! + * + * Don't use err() or exit() here! + */ +static int set_watchdog(struct wdinfo *wd, int timeout) +{ + int fd; + sigset_t sigs, oldsigs; + int rc = 0; + + assert(wd->device); + + sigemptyset(&oldsigs); + sigfillset(&sigs); + sigprocmask(SIG_BLOCK, &sigs, &oldsigs); + + fd = open(wd->device, O_WRONLY|O_CLOEXEC); + + if (fd < 0) { + if (errno == EBUSY) + warnx(_("%s: watchdog already in use, terminating."), + wd->device); + warn(_("cannot open %s"), wd->device); + return -1; + } + + for (;;) { + /* We just opened this to query the state, not to arm + * it hence use the magic close character */ + static const char v = 'V'; + + if (write(fd, &v, 1) >= 0) + break; + if (errno != EINTR) { + warn(_("%s: failed to disarm watchdog"), wd->device); + break; + } + /* Let's try hard, since if we don't get this right + * the machine might end up rebooting. */ + } + + if (ioctl(fd, WDIOC_SETTIMEOUT, &timeout) != 0) { + rc = errno; + warn(_("cannot set timeout for %s"), wd->device); + } + + if (close(fd)) + warn(_("write failed")); + sigprocmask(SIG_SETMASK, &oldsigs, NULL); + printf(P_("Timeout has been set to %d second.\n", + "Timeout has been set to %d seconds.\n", timeout), timeout); + + return rc; +} + +/* + * Warning: successfully opened watchdog has to be properly closed with magic + * close character otherwise the machine will be rebooted! + * + * Don't use err() or exit() here! + */ +static int read_watchdog(struct wdinfo *wd) +{ + int fd; + sigset_t sigs, oldsigs; + + assert(wd->device); + + sigemptyset(&oldsigs); + sigfillset(&sigs); + sigprocmask(SIG_BLOCK, &sigs, &oldsigs); + + fd = open(wd->device, O_WRONLY|O_CLOEXEC); + + if (fd < 0) { + if (errno == EBUSY) + warnx(_("%s: watchdog already in use, terminating."), + wd->device); + warn(_("cannot open %s"), wd->device); + return -1; + } + + if (ioctl(fd, WDIOC_GETSUPPORT, &wd->ident) < 0) + warn(_("%s: failed to get information about watchdog"), wd->device); + else { + ioctl(fd, WDIOC_GETSTATUS, &wd->status); + ioctl(fd, WDIOC_GETBOOTSTATUS, &wd->bstatus); + + if (ioctl(fd, WDIOC_GETTIMEOUT, &wd->timeout) >= 0) + wd->has_timeout = 1; + if (ioctl(fd, WDIOC_GETPRETIMEOUT, &wd->pretimeout) >= 0) + wd->has_pretimeout = 1; + if (ioctl(fd, WDIOC_GETTIMELEFT, &wd->timeleft) >= 0) + wd->has_timeleft = 1; + } + + for (;;) { + /* We just opened this to query the state, not to arm + * it hence use the magic close character */ + static const char v = 'V'; + + if (write(fd, &v, 1) >= 0) + break; + if (errno != EINTR) { + warn(_("%s: failed to disarm watchdog"), wd->device); + break; + } + /* Let's try hard, since if we don't get this right + * the machine might end up rebooting. */ + } + + if (close(fd)) + warn(_("write failed")); + sigprocmask(SIG_SETMASK, &oldsigs, NULL); + + return 0; +} + +static void print_oneline(struct wdinfo *wd, uint32_t wanted, + int noident, int notimeouts, int noflags) +{ + printf("%s:", wd->device); + + if (!noident) { + printf(" VERSION=\"%x\"", wd->ident.firmware_version); + + printf(" IDENTITY="); + fputs_quoted((char *) wd->ident.identity, stdout); + } + if (!notimeouts) { + if (wd->has_timeout) + printf(" TIMEOUT=\"%i\"", wd->timeout); + if (wd->has_pretimeout) + printf(" PRETIMEOUT=\"%i\"", wd->pretimeout); + if (wd->has_timeleft) + printf(" TIMELEFT=\"%i\"", wd->timeleft); + } + + if (!noflags) { + size_t i; + uint32_t flags = wd->ident.options; + + for (i = 0; i < ARRAY_SIZE(wdflags); i++) { + const struct wdflag *fl; + + if ((wanted && !(wanted & wdflags[i].flag)) || + !(flags & wdflags[i].flag)) + continue; + + fl= &wdflags[i]; + + printf(" %s=\"%s\"", fl->name, + wd->status & fl->flag ? "1" : "0"); + printf(" %s_BOOT=\"%s\"", fl->name, + wd->bstatus & fl->flag ? "1" : "0"); + + } + } + + fputc('\n', stdout); +} + +static void show_timeouts(struct wdinfo *wd) +{ + if (wd->has_timeout) + printf(P_("%-14s %2i second\n", "%-14s %2i seconds\n", wd->timeout), + _("Timeout:"), wd->timeout); + if (wd->has_pretimeout) + printf(P_("%-14s %2i second\n", "%-14s %2i seconds\n", wd->pretimeout), + _("Pre-timeout:"), wd->pretimeout); + if (wd->has_timeleft) + printf(P_("%-14s %2i second\n", "%-14s %2i seconds\n", wd->timeleft), + _("Timeleft:"), wd->timeleft); +} + +int main(int argc, char *argv[]) +{ + struct wdinfo wd; + int c, res = EXIT_SUCCESS, count = 0; + char noflags = 0, noident = 0, notimeouts = 0, oneline = 0; + uint32_t wanted = 0; + int timeout = 0; + + static const struct option long_opts[] = { + { "flags", required_argument, NULL, 'f' }, + { "flags-only", no_argument, NULL, 'x' }, + { "help", no_argument, NULL, 'h' }, + { "noflags", no_argument, NULL, 'F' }, + { "noheadings", no_argument, NULL, 'n' }, + { "noident", no_argument, NULL, 'I' }, + { "notimeouts", no_argument, NULL, 'T' }, + { "settimeout", required_argument, NULL, 's' }, + { "output", required_argument, NULL, 'o' }, + { "oneline", no_argument, NULL, 'O' }, + { "raw", no_argument, NULL, 'r' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { /* rows and cols in ASCII order */ + { 'F','f' }, /* noflags,flags*/ + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, + "d:f:hFnITo:s:OrVx", long_opts, NULL)) != -1) { + + err_exclusive_options(c, long_opts, excl, excl_st); + + switch(c) { + case 'o': + ncolumns = string_to_idarray(optarg, + columns, ARRAY_SIZE(columns), + column2id); + if (ncolumns < 0) + return EXIT_FAILURE; + break; + case 's': + timeout = strtos32_or_err(optarg, _("invalid timeout argument")); + break; + case 'f': + if (string_to_bitmask(optarg, (unsigned long *) &wanted, name2bit) != 0) + return EXIT_FAILURE; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + case 'F': + noflags = 1; + break; + case 'I': + noident = 1; + break; + case 'T': + notimeouts = 1; + break; + case 'n': + no_headings = 1; + break; + case 'r': + raw = 1; + break; + case 'O': + oneline = 1; + break; + case 'x': + noident = 1; + notimeouts = 1; + break; + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (!ncolumns) { + /* default columns */ + columns[ncolumns++] = COL_FLAG; + columns[ncolumns++] = COL_DESC; + columns[ncolumns++] = COL_STATUS; + columns[ncolumns++] = COL_BSTATUS; + } + + do { + int rc; + + memset(&wd, 0, sizeof(wd)); + + if (optind == argc) + wd.device = _PATH_WATCHDOG_DEV; + else + wd.device = argv[optind++]; + + if (count) + fputc('\n', stdout); + count++; + + if (timeout) { + rc = set_watchdog(&wd, timeout); + if (rc) { + res = EXIT_FAILURE; + } + } + + rc = read_watchdog(&wd); + if (rc) { + res = EXIT_FAILURE; + continue; + } + + if (oneline) { + print_oneline(&wd, wanted, noident, notimeouts, noflags); + continue; + } + + /* pretty output */ + if (!noident) { + printf("%-15s%s\n", _("Device:"), wd.device); + printf("%-15s%s [%s %x]\n", + _("Identity:"), + wd.ident.identity, + _("version"), + wd.ident.firmware_version); + } + if (!notimeouts) + show_timeouts(&wd); + if (!noflags) + show_flags(&wd, wanted); + } while (optind < argc); + + return res; +} diff --git a/sys-utils/zramctl.8 b/sys-utils/zramctl.8 new file mode 100644 index 0000000..c6ecdc3 --- /dev/null +++ b/sys-utils/zramctl.8 @@ -0,0 +1,131 @@ +.TH ZRAMCTL 8 "July 2014" "util-linux" "System Administration" +.SH NAME +zramctl \- set up and control zram devices +.SH SYNOPSIS +.ad l +Get info: +.sp +.in +5 +.BR zramctl " [options]" +.sp +.in -5 +Reset zram: +.sp +.in +5 +.B "zramctl \-r" +.IR zramdev ... +.sp +.in -5 +Print name of first unused zram device: +.sp +.in +5 +.B "zramctl \-f" +.sp +.in -5 +Set up a zram device: +.sp +.in +5 +.B zramctl +.RB [ \-f " | "\fIzramdev\fP ] +.RB [ \-s +.IR size ] +.RB [ \-t +.IR number ] +.RB [ \-a +.IR algorithm ] +.sp +.in -5 +.ad b +.SH DESCRIPTION +.B zramctl +is used to quickly set up zram device parameters, to reset zram devices, and to +query the status of used zram devices. +.PP +If no option is given, all non-zero size zram devices are shown. +.PP +Note that \fIzramdev\fP node specified on command line has to already exist. The command +.B zramctl +creates a new /dev/zram<N> nodes only when \fB\-\-find\fR option specified. It's possible +(and common) that after system boot /dev/zram<N> nodes are not created yet. +.SH OPTIONS +.TP +.BR \-a , " \-\-algorithm lzo" | lz4 | lz4hc | deflate | 842 +Set the compression algorithm to be used for compressing data in the zram device. +.TP +.BR \-f , " \-\-find" +Find the first unused zram device. If a \fB\-\-size\fR argument is present, then +initialize the device. +.TP +.BR \-n , " \-\-noheadings" +Do not print a header line in status output. +.TP +.BR \-o , " \-\-output " \fIlist +Define the status output columns to be used. If no output arrangement is +specified, then a default set is used. +Use \fB\-\-help\fP to get a list of all supported columns. +.TP +.B \-\-output\-all +Output all available columns. +.TP +.B \-\-raw +Use the raw format for status output. +.TP +.BR \-r , " \-\-reset" +Reset the options of the specified zram device(s). Zram device settings +can be changed only after a reset. +.TP +.BR \-s , " \-\-size " \fIsize +Create a zram device of the specified \fIsize\fR. +Zram devices are aligned to memory pages; when the requested \fIsize\fR is +not a multiple of the page size, it will be rounded up to the next multiple. +When not otherwise specified, the unit of the \fIsize\fR parameter is bytes. +.IP +The \fIsize\fR argument may be followed by the multiplicative suffixes KiB (=1024), +MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB and YiB (the "iB" +is optional, e.g., "K" has the same meaning as "KiB") or the suffixes +KB (=1000), MB (=1000*1000), and so on for GB, TB, PB, EB, ZB and YB. +.TP +.BR \-t , " \-\-streams " \fInumber +Set the maximum number of compression streams that can be used for the device. +The default is one stream. +.TP +.BR \-V , " \-\-version" +Display version information and exit. +.TP +.BR \-h , " \-\-help" +Display help text and exit. + +.SH RETURN VALUE +.B zramctl +returns 0 on success, nonzero on failure. + +.SH FILES +.TP +.I /dev/zram[0..N] +zram block devices + +.SH EXAMPLE +The following commands set up a zram device with a size of one gigabyte +and use it as swap device. +.nf +.IP +# zramctl --find --size 1024M +/dev/zram0 +# mkswap /dev/zram0 +# swapon /dev/zram0 + ... +# swapoff /dev/zram0 +# zramctl --reset /dev/zram0 +.fi +.SH SEE ALSO +.UR http://git.\:kernel.\:org\:/cgit\:/linux\:/kernel\:/git\:/torvalds\:/linux.git\:/tree\:/Documentation\:/blockdev\:/zram.txt +Linux kernel documentation +.UE . +.SH AUTHORS +.nf +Timofey Titovets <nefelim4ag@gmail.com> +Karel Zak <kzak@redhat.com> +.fi +.SH AVAILABILITY +The zramctl command is part of the util-linux package and is available from +https://www.kernel.org/pub/linux/utils/util-linux/. diff --git a/sys-utils/zramctl.c b/sys-utils/zramctl.c new file mode 100644 index 0000000..69267c8 --- /dev/null +++ b/sys-utils/zramctl.c @@ -0,0 +1,765 @@ +/* + * zramctl - control compressed block devices in RAM + * + * Copyright (c) 2014 Timofey Titovets <Nefelim4ag@gmail.com> + * Copyright (C) 2014 Karel Zak <kzak@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <getopt.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <assert.h> +#include <sys/types.h> +#include <dirent.h> + +#include <libsmartcols.h> + +#include "c.h" +#include "nls.h" +#include "closestream.h" +#include "strutils.h" +#include "xalloc.h" +#include "sysfs.h" +#include "optutils.h" +#include "ismounted.h" +#include "strv.h" +#include "path.h" +#include "pathnames.h" + +/*#define CONFIG_ZRAM_DEBUG*/ + +#ifdef CONFIG_ZRAM_DEBUG +# define DBG(x) do { fputs("zram: ", stderr); x; fputc('\n', stderr); } while(0) +#else +# define DBG(x) +#endif + +/* status output columns */ +struct colinfo { + const char *name; + double whint; + int flags; + const char *help; +}; + +enum { + COL_NAME = 0, + COL_DISKSIZE, + COL_ORIG_SIZE, + COL_COMP_SIZE, + COL_ALGORITHM, + COL_STREAMS, + COL_ZEROPAGES, + COL_MEMTOTAL, + COL_MEMLIMIT, + COL_MEMUSED, + COL_MIGRATED, + COL_MOUNTPOINT +}; + +static const struct colinfo infos[] = { + [COL_NAME] = { "NAME", 0.25, 0, N_("zram device name") }, + [COL_DISKSIZE] = { "DISKSIZE", 5, SCOLS_FL_RIGHT, N_("limit on the uncompressed amount of data") }, + [COL_ORIG_SIZE] = { "DATA", 5, SCOLS_FL_RIGHT, N_("uncompressed size of stored data") }, + [COL_COMP_SIZE] = { "COMPR", 5, SCOLS_FL_RIGHT, N_("compressed size of stored data") }, + [COL_ALGORITHM] = { "ALGORITHM", 3, 0, N_("the selected compression algorithm") }, + [COL_STREAMS] = { "STREAMS", 3, SCOLS_FL_RIGHT, N_("number of concurrent compress operations") }, + [COL_ZEROPAGES] = { "ZERO-PAGES", 3, SCOLS_FL_RIGHT, N_("empty pages with no allocated memory") }, + [COL_MEMTOTAL] = { "TOTAL", 5, SCOLS_FL_RIGHT, N_("all memory including allocator fragmentation and metadata overhead") }, + [COL_MEMLIMIT] = { "MEM-LIMIT", 5, SCOLS_FL_RIGHT, N_("memory limit used to store compressed data") }, + [COL_MEMUSED] = { "MEM-USED", 5, SCOLS_FL_RIGHT, N_("memory zram have been consumed to store compressed data") }, + [COL_MIGRATED] = { "MIGRATED", 5, SCOLS_FL_RIGHT, N_("number of objects migrated by compaction") }, + [COL_MOUNTPOINT]= { "MOUNTPOINT",0.10, SCOLS_FL_TRUNC, N_("where the device is mounted") }, +}; + +static int columns[ARRAY_SIZE(infos) * 2] = {-1}; +static int ncolumns; + +enum { + MM_ORIG_DATA_SIZE = 0, + MM_COMPR_DATA_SIZE, + MM_MEM_USED_TOTAL, + MM_MEM_LIMIT, + MM_MEM_USED_MAX, + MM_ZERO_PAGES, + MM_NUM_MIGRATED +}; + +static const char *mm_stat_names[] = { + [MM_ORIG_DATA_SIZE] = "orig_data_size", + [MM_COMPR_DATA_SIZE] = "compr_data_size", + [MM_MEM_USED_TOTAL] = "mem_used_total", + [MM_MEM_LIMIT] = "mem_limit", + [MM_MEM_USED_MAX] = "mem_used_max", + [MM_ZERO_PAGES] = "zero_pages", + [MM_NUM_MIGRATED] = "num_migrated" +}; + +struct zram { + char devname[32]; + struct path_cxt *sysfs; /* device specific sysfs directory */ + char **mm_stat; + + unsigned int mm_stat_probed : 1, + control_probed : 1, + has_control : 1; /* has /sys/class/zram-control/ */ +}; + +static unsigned int raw, no_headings, inbytes; +static struct path_cxt *__control; + +static int get_column_id(int num) +{ + assert(num < ncolumns); + assert(columns[num] < (int) ARRAY_SIZE(infos)); + return columns[num]; +} + +static const struct colinfo *get_column_info(int num) +{ + return &infos[ get_column_id(num) ]; +} + +static int column_name_to_id(const char *name, size_t namesz) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(infos); i++) { + const char *cn = infos[i].name; + + if (!strncasecmp(name, cn, namesz) && !*(cn + namesz)) + return i; + } + warnx(_("unknown column: %s"), name); + return -1; +} + +static void zram_reset_stat(struct zram *z) +{ + if (z) { + strv_free(z->mm_stat); + z->mm_stat = NULL; + z->mm_stat_probed = 0; + } +} + +static void zram_set_devname(struct zram *z, const char *devname, size_t n) +{ + assert(z); + + if (!devname) + snprintf(z->devname, sizeof(z->devname), "/dev/zram%zu", n); + else { + strncpy(z->devname, devname, sizeof(z->devname)); + z->devname[sizeof(z->devname) - 1] = '\0'; + } + + DBG(fprintf(stderr, "set devname: %s", z->devname)); + ul_unref_path(z->sysfs); + z->sysfs = NULL; + zram_reset_stat(z); +} + +static int zram_get_devnum(struct zram *z) +{ + int n; + + assert(z); + + if (sscanf(z->devname, "/dev/zram%d", &n) == 1) + return n; + return -EINVAL; +} + +static struct zram *new_zram(const char *devname) +{ + struct zram *z = xcalloc(1, sizeof(struct zram)); + + DBG(fprintf(stderr, "new: %p", z)); + if (devname) + zram_set_devname(z, devname, 0); + return z; +} + +static void free_zram(struct zram *z) +{ + if (!z) + return; + DBG(fprintf(stderr, "free: %p", z)); + ul_unref_path(z->sysfs); + zram_reset_stat(z); + free(z); +} + +static struct path_cxt *zram_get_sysfs(struct zram *z) +{ + assert(z); + + if (!z->sysfs) { + dev_t devno = sysfs_devname_to_devno(z->devname); + if (!devno) + return NULL; + z->sysfs = ul_new_sysfs_path(devno, NULL, NULL); + if (!z->sysfs) + return NULL; + if (*z->devname != '/') + /* canonicalize the device name according to /sys */ + sysfs_blkdev_get_path(z->sysfs, z->devname, sizeof(z->devname)); + } + + return z->sysfs; +} + +static inline int zram_exist(struct zram *z) +{ + assert(z); + + errno = 0; + if (zram_get_sysfs(z) == NULL) { + errno = ENODEV; + return 0; + } + + DBG(fprintf(stderr, "%s exists", z->devname)); + return 1; +} + +static int zram_set_u64parm(struct zram *z, const char *attr, uint64_t num) +{ + struct path_cxt *sysfs = zram_get_sysfs(z); + if (!sysfs) + return -EINVAL; + DBG(fprintf(stderr, "%s writing %ju to %s", z->devname, num, attr)); + return ul_path_write_u64(sysfs, num, attr); +} + +static int zram_set_strparm(struct zram *z, const char *attr, const char *str) +{ + struct path_cxt *sysfs = zram_get_sysfs(z); + if (!sysfs) + return -EINVAL; + DBG(fprintf(stderr, "%s writing %s to %s", z->devname, str, attr)); + return ul_path_write_string(sysfs, str, attr); +} + + +static int zram_used(struct zram *z) +{ + uint64_t size; + struct path_cxt *sysfs = zram_get_sysfs(z); + + if (sysfs && + ul_path_read_u64(sysfs, &size, "disksize") == 0 && + size > 0) { + + DBG(fprintf(stderr, "%s used", z->devname)); + return 1; + } + DBG(fprintf(stderr, "%s unused", z->devname)); + return 0; +} + +static int zram_has_control(struct zram *z) +{ + if (!z->control_probed) { + z->has_control = access(_PATH_SYS_CLASS "/zram-control/", F_OK) == 0 ? 1 : 0; + z->control_probed = 1; + DBG(fprintf(stderr, "zram-control: %s", z->has_control ? "yes" : "no")); + } + + return z->has_control; +} + +static struct path_cxt *zram_get_control(void) +{ + if (!__control) + __control = ul_new_path(_PATH_SYS_CLASS "/zram-control"); + return __control; +} + +static int zram_control_add(struct zram *z) +{ + int n; + struct path_cxt *ctl; + + if (!zram_has_control(z) || !(ctl = zram_get_control())) + return -ENOSYS; + + if (ul_path_read_s32(ctl, &n, "hot_add") != 0 || n < 0) + return n; + + DBG(fprintf(stderr, "hot-add: %d", n)); + zram_set_devname(z, NULL, n); + return 0; +} + +static int zram_control_remove(struct zram *z) +{ + struct path_cxt *ctl; + int n; + + if (!zram_has_control(z) || !(ctl = zram_get_control())) + return -ENOSYS; + + n = zram_get_devnum(z); + if (n < 0) + return n; + + DBG(fprintf(stderr, "hot-remove: %d", n)); + return ul_path_write_u64(ctl, n, "hot_remove"); +} + +static struct zram *find_free_zram(void) +{ + struct zram *z = new_zram(NULL); + size_t i; + int isfree = 0; + + for (i = 0; isfree == 0; i++) { + DBG(fprintf(stderr, "find free: checking zram%zu", i)); + zram_set_devname(z, NULL, i); + if (!zram_exist(z) && zram_control_add(z) != 0) + break; + isfree = !zram_used(z); + } + if (!isfree) { + free_zram(z); + z = NULL; + } + return z; +} + +static char *get_mm_stat(struct zram *z, size_t idx, int bytes) +{ + struct path_cxt *sysfs; + const char *name; + char *str = NULL; + uint64_t num; + + assert(idx < ARRAY_SIZE(mm_stat_names)); + assert(z); + + sysfs = zram_get_sysfs(z); + if (!sysfs) + return NULL; + + /* Linux >= 4.1 uses /sys/block/zram<id>/mm_stat */ + if (!z->mm_stat && !z->mm_stat_probed) { + if (ul_path_read_string(sysfs, &str, "mm_stat") > 0 && str) { + z->mm_stat = strv_split(str, " "); + + /* make sure kernel provides mm_stat as expected */ + if (strv_length(z->mm_stat) < ARRAY_SIZE(mm_stat_names)) { + strv_free(z->mm_stat); + z->mm_stat = NULL; + } + } + z->mm_stat_probed = 1; + free(str); + str = NULL; + } + + if (z->mm_stat) { + if (bytes) + return xstrdup(z->mm_stat[idx]); + + num = strtou64_or_err(z->mm_stat[idx], _("Failed to parse mm_stat")); + return size_to_human_string(SIZE_SUFFIX_1LETTER, num); + } + + /* Linux < 4.1 uses /sys/block/zram<id>/<attrname> */ + name = mm_stat_names[idx]; + if (bytes) { + ul_path_read_string(sysfs, &str, name); + return str; + + } else if (ul_path_read_u64(sysfs, &num, name) == 0) + return size_to_human_string(SIZE_SUFFIX_1LETTER, num); + + return NULL; +} + +static void fill_table_row(struct libscols_table *tb, struct zram *z) +{ + static struct libscols_line *ln; + struct path_cxt *sysfs; + size_t i; + uint64_t num; + + assert(tb); + assert(z); + + DBG(fprintf(stderr, "%s: filling status table", z->devname)); + + sysfs = zram_get_sysfs(z); + if (!sysfs) + return; + + ln = scols_table_new_line(tb, NULL); + if (!ln) + err(EXIT_FAILURE, _("failed to allocate output line")); + + for (i = 0; i < (size_t) ncolumns; i++) { + char *str = NULL; + + switch (get_column_id(i)) { + case COL_NAME: + str = xstrdup(z->devname); + break; + case COL_DISKSIZE: + if (inbytes) + ul_path_read_string(sysfs, &str, "disksize"); + + else if (ul_path_read_u64(sysfs, &num, "disksize") == 0) + str = size_to_human_string(SIZE_SUFFIX_1LETTER, num); + break; + case COL_ALGORITHM: + { + char *alg = NULL; + + ul_path_read_string(sysfs, &alg, "comp_algorithm"); + if (alg) { + char* lbr = strrchr(alg, '['); + char* rbr = strrchr(alg, ']'); + + if (lbr != NULL && rbr != NULL && rbr - lbr > 1) + str = xstrndup(lbr + 1, rbr - lbr - 1); + free(alg); + } + break; + } + case COL_MOUNTPOINT: + { + char path[PATH_MAX] = { '\0' }; + int fl; + + check_mount_point(z->devname, &fl, path, sizeof(path)); + if (*path) + str = xstrdup(path); + break; + } + case COL_STREAMS: + ul_path_read_string(sysfs, &str, "max_comp_streams"); + break; + case COL_ZEROPAGES: + str = get_mm_stat(z, MM_ZERO_PAGES, 1); + break; + case COL_ORIG_SIZE: + str = get_mm_stat(z, MM_ORIG_DATA_SIZE, inbytes); + break; + case COL_COMP_SIZE: + str = get_mm_stat(z, MM_COMPR_DATA_SIZE, inbytes); + break; + case COL_MEMTOTAL: + str = get_mm_stat(z, MM_MEM_USED_TOTAL, inbytes); + break; + case COL_MEMLIMIT: + str = get_mm_stat(z, MM_MEM_LIMIT, inbytes); + break; + case COL_MEMUSED: + str = get_mm_stat(z, MM_MEM_USED_MAX, inbytes); + break; + case COL_MIGRATED: + str = get_mm_stat(z, MM_NUM_MIGRATED, inbytes); + break; + } + if (str && scols_line_refer_data(ln, i, str)) + err(EXIT_FAILURE, _("failed to add output data")); + } +} + +static void status(struct zram *z) +{ + struct libscols_table *tb; + size_t i; + DIR *dir; + struct dirent *d; + + scols_init_debug(0); + + tb = scols_new_table(); + if (!tb) + err(EXIT_FAILURE, _("failed to allocate output table")); + + scols_table_enable_raw(tb, raw); + scols_table_enable_noheadings(tb, no_headings); + + for (i = 0; i < (size_t) ncolumns; i++) { + const struct colinfo *col = get_column_info(i); + + if (!scols_table_new_column(tb, col->name, col->whint, col->flags)) + err(EXIT_FAILURE, _("failed to initialize output column")); + } + + if (z) { + /* just one device specified */ + fill_table_row(tb, z); + goto print_table; + } + + /* list all used devices */ + z = new_zram(NULL); + if (!(dir = opendir(_PATH_DEV))) + err(EXIT_FAILURE, _("cannot open %s"), _PATH_DEV); + + while ((d = readdir(dir))) { + int n; + if (sscanf(d->d_name, "zram%d", &n) != 1) + continue; + zram_set_devname(z, NULL, n); + if (zram_exist(z) && zram_used(z)) + fill_table_row(tb, z); + } + closedir(dir); + free_zram(z); + +print_table: + scols_print_table(tb); + scols_unref_table(tb); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + size_t i; + + fputs(USAGE_HEADER, out); + fprintf(out, _( " %1$s [options] <device>\n" + " %1$s -r <device> [...]\n" + " %1$s [options] -f | <device> -s <size>\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Set up and control zram devices.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -a, --algorithm lzo|lz4|lz4hc|deflate|842 compression algorithm to use\n"), out); + fputs(_(" -b, --bytes print sizes in bytes rather than in human readable format\n"), out); + fputs(_(" -f, --find find a free device\n"), out); + fputs(_(" -n, --noheadings don't print headings\n"), out); + fputs(_(" -o, --output <list> columns to use for status output\n"), out); + fputs(_(" --output-all output all columns\n"), out); + fputs(_(" --raw use raw status output format\n"), out); + fputs(_(" -r, --reset reset all specified devices\n"), out); + fputs(_(" -s, --size <size> device size\n"), out); + fputs(_(" -t, --streams <number> number of compression streams\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(27)); + + fputs(USAGE_COLUMNS, out); + for (i = 0; i < ARRAY_SIZE(infos); i++) + fprintf(out, " %11s %s\n", infos[i].name, _(infos[i].help)); + + printf(USAGE_MAN_TAIL("zramctl(8)")); + exit(EXIT_SUCCESS); +} + +/* actions */ +enum { + A_NONE = 0, + A_STATUS, + A_CREATE, + A_FINDONLY, + A_RESET +}; + +int main(int argc, char **argv) +{ + uintmax_t size = 0, nstreams = 0; + char *algorithm = NULL; + int rc = 0, c, find = 0, act = A_NONE; + struct zram *zram = NULL; + + enum { + OPT_RAW = CHAR_MAX + 1, + OPT_LIST_TYPES + }; + + static const struct option longopts[] = { + { "algorithm", required_argument, NULL, 'a' }, + { "bytes", no_argument, NULL, 'b' }, + { "find", no_argument, NULL, 'f' }, + { "help", no_argument, NULL, 'h' }, + { "output", required_argument, NULL, 'o' }, + { "output-all",no_argument, NULL, OPT_LIST_TYPES }, + { "noheadings",no_argument, NULL, 'n' }, + { "reset", no_argument, NULL, 'r' }, + { "raw", no_argument, NULL, OPT_RAW }, + { "size", required_argument, NULL, 's' }, + { "streams", required_argument, NULL, 't' }, + { "version", no_argument, NULL, 'V' }, + { NULL, 0, NULL, 0 } + }; + + static const ul_excl_t excl[] = { + { 'f', 'o', 'r' }, + { 'o', 'r', 's' }, + { 0 } + }; + int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + atexit(close_stdout); + + while ((c = getopt_long(argc, argv, "a:bfho:nrs:t:V", longopts, NULL)) != -1) { + + err_exclusive_options(c, longopts, excl, excl_st); + + switch (c) { + case 'a': + algorithm = optarg; + break; + case 'b': + inbytes = 1; + break; + case 'f': + find = 1; + break; + case 'o': + ncolumns = string_to_idarray(optarg, + columns, ARRAY_SIZE(columns), + column_name_to_id); + if (ncolumns < 0) + return EXIT_FAILURE; + break; + case OPT_LIST_TYPES: + for (ncolumns = 0; (size_t)ncolumns < ARRAY_SIZE(infos); ncolumns++) + columns[ncolumns] = ncolumns; + break; + case 's': + size = strtosize_or_err(optarg, _("failed to parse size")); + act = A_CREATE; + break; + case 't': + nstreams = strtou64_or_err(optarg, _("failed to parse streams")); + break; + case 'r': + act = A_RESET; + break; + case OPT_RAW: + raw = 1; + break; + case 'n': + no_headings = 1; + break; + case 'V': + printf(UTIL_LINUX_VERSION); + return EXIT_SUCCESS; + case 'h': + usage(); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if (find && optind < argc) + errx(EXIT_FAILURE, _("option --find is mutually exclusive " + "with <device>")); + if (act == A_NONE) + act = find ? A_FINDONLY : A_STATUS; + + if (act != A_RESET && optind + 1 < argc) + errx(EXIT_FAILURE, _("only one <device> at a time is allowed")); + + if ((act == A_STATUS || act == A_FINDONLY) && (algorithm || nstreams)) + errx(EXIT_FAILURE, _("options --algorithm and --streams " + "must be combined with --size")); + + ul_path_init_debug(); + ul_sysfs_init_debug(); + + switch (act) { + case A_STATUS: + if (!ncolumns) { /* default columns */ + columns[ncolumns++] = COL_NAME; + columns[ncolumns++] = COL_ALGORITHM; + columns[ncolumns++] = COL_DISKSIZE; + columns[ncolumns++] = COL_ORIG_SIZE; + columns[ncolumns++] = COL_COMP_SIZE; + columns[ncolumns++] = COL_MEMTOTAL; + columns[ncolumns++] = COL_STREAMS; + columns[ncolumns++] = COL_MOUNTPOINT; + } + if (optind < argc) { + zram = new_zram(argv[optind++]); + if (!zram_exist(zram)) + err(EXIT_FAILURE, "%s", zram->devname); + } + status(zram); + free_zram(zram); + break; + case A_RESET: + if (optind == argc) + errx(EXIT_FAILURE, _("no device specified")); + while (optind < argc) { + zram = new_zram(argv[optind]); + if (!zram_exist(zram) + || zram_set_u64parm(zram, "reset", 1)) { + warn(_("%s: failed to reset"), zram->devname); + rc = 1; + } + zram_control_remove(zram); + free_zram(zram); + optind++; + } + break; + case A_FINDONLY: + zram = find_free_zram(); + if (!zram) + errx(EXIT_FAILURE, _("no free zram device found")); + printf("%s\n", zram->devname); + free_zram(zram); + break; + case A_CREATE: + if (find) { + zram = find_free_zram(); + if (!zram) + errx(EXIT_FAILURE, _("no free zram device found")); + } else if (optind == argc) + errx(EXIT_FAILURE, _("no device specified")); + else { + zram = new_zram(argv[optind]); + if (!zram_exist(zram)) + err(EXIT_FAILURE, "%s", zram->devname); + } + + if (zram_set_u64parm(zram, "reset", 1)) + err(EXIT_FAILURE, _("%s: failed to reset"), zram->devname); + + if (nstreams && + zram_set_u64parm(zram, "max_comp_streams", nstreams)) + err(EXIT_FAILURE, _("%s: failed to set number of streams"), zram->devname); + + if (algorithm && + zram_set_strparm(zram, "comp_algorithm", algorithm)) + err(EXIT_FAILURE, _("%s: failed to set algorithm"), zram->devname); + + if (zram_set_u64parm(zram, "disksize", size)) + err(EXIT_FAILURE, _("%s: failed to set disksize (%ju bytes)"), + zram->devname, size); + if (find) + printf("%s\n", zram->devname); + free_zram(zram); + break; + } + + ul_unref_path(__control); + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +} |