Linux
Common checks
Monitoring
# See CPU + RAM usage, system stats and running processes
top
# Only list processes making active use of the CPU
top -i
# Only list processes making active use of the CPU, and show the full command line instead of just the tool name
top -ci
# Prettier version of top that can be customized
htop
# Reimagined version of top, includes network and disk usage by default
btop
Systemd
# Show logs from the current boot
journalctl -b
# Open journalctl and jump to the end of the log
journalctl -e
# Augment log lines with explanatory texts from the message catalog
journalctl -x
# Show journalctl logs for the sshd service, starting from the end
journalctl -u sshd -e
# Output contents directly to the shell instead of the pager
journalctl --no-pager
OS & Distribution
# Print OS and host information
hostnamectl
# Show kernel version and build information
cat /proc/version
# Show OS and distribution information
cat /etc/os-release
# Print distribution-specific information
lsb_release -a
Hardware & kernel
# List installed kernel modules
lsmod
# Print kernel messages
dmesg
# Print kernel messages with human-readable timestamps
dmesg -T
# SCSI hardware information
cat /proc/scsi/scsi
# Print hardware/BIOS information
dmidecode
# Print hardware/BIOS information of a specific type
dmidecode -t 1
# List all connected hardware
lshw
# List physical network hardware
lshw -short -class network
# List physical memory hardware
lshw -class memory
# Show PCI information
lspci
# Show verbose PCI information
lspci -v
# List all block/filesystem devices
lsblk
# List block devices and partition tables
fdisk -l
Filesystems
# List NFS clients that have mounted exports from this server
showmount
SMB/CIFS
# Samba checks
smbstatus
# Only show Samba shares in use
smbstatus -S
# Brief Samba output
smbstatus -b
# Set the debug level of the running smbd to 1
smbcontrol smbd debug 1
NFS
https://www.ibm.com/docs/en/aix/7.2?topic=troubleshooting-identifying-nfs-problems
# NFS statistics
nfsstat
# Detailed RPC and packet information
nfsstat -o all
# Every RPC "program" is bound to a specific NFS version. Use NFS/CTDB logs
# in combination with the program ID to identify the failing component
rpcinfo -p
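To pull just the NFS registrations out of that table, a small filter over the standard five-column rpcinfo output (program, version, protocol, port, service):

# Print the program number and version for every registered NFS service
rpcinfo -p | awk '$5 == "nfs" {print $1, $2}'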
UFW
# Show summary of UFW status
ufw status
# Show verbose UFW status
ufw status verbose
# Show UFW rules, numbered
ufw status numbered
CTDB
# CTDB checks in a cluster
ctdb status
ctdb ip
ctdb scriptstatus
ctdb event status
ctdb uptime
ctdb statistics
# Use the onnode command to execute a command on all cluster nodes
onnode all ctdb status
Pacemaker
# Show status of the Pacemaker cluster
pcs cluster status
# Show status of the Pacemaker service
pcs status
# Show configured Pacemaker resources
pcs resource config
# Show a specific configured resource
pcs resource show ResourceNameHere
Services
NTP
Timedatectl
# Show the current status of timedatectl
timedatectl
# List available timezones
timedatectl list-timezones
# Set the timezone to Amsterdam
timedatectl set-timezone Europe/Amsterdam
# Show verbose sync information
timedatectl timesync-status
SNMPv3 client installation
https://kifarunix.com/quick-way-to-install-and-configure-snmp-on-ubuntu-20-04/
apt install snmpd snmp libsnmp-dev
cp /etc/snmp/snmpd.conf /etc/snmp/snmpd.conf.bak
systemctl stop snmpd
# -A = authentication passphrase, -X = privacy (encryption) passphrase
net-snmp-create-v3-user -ro -a SHA -A <AUTH-PASSWORD> -x AES -X <PRIV-PASSWORD> <USERNAME>
# /etc/snmp/snmpd.conf
sysLocation NL;Zuid-Holland;Rotterdam, 78 MyStreet;2nd Floor;Server Room;Rack
sysContact Me <me@example.org>
agentaddress 192.168.0.10
systemctl start snmpd
systemctl enable snmpd
# Test
snmpwalk -v3 -a SHA -A "<AUTH-PASSWORD>" -x AES -X "<PRIV-PASSWORD>" -l authPriv -u "<USERNAME>" localhost | head
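Because the config above binds the agent to 192.168.0.10, a walk against localhost may get no response; a single remote GET, assuming the same credentials, is a cleaner check:

# From another host, fetch the system name over SNMPv3
snmpget -v3 -a SHA -A "<AUTH-PASSWORD>" -x AES -X "<PRIV-PASSWORD>" -l authPriv -u "<USERNAME>" 192.168.0.10 SNMPv2-MIB::sysName.0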
CTDB
# Stop a CTDB cluster member
ctdb stop
# Start a stopped CTDB cluster member
ctdb continue
Firewalls
UFW
# Allow access from a specific IP to a port, with a comment that shows in the status
ufw allow from 10.0.0.253 to any port 22 proto tcp comment 'Allow SSH access from XYZ location'
# Delete numbered firewall rule 56
ufw delete 56
# Disable UFW logging (prevents syslog spam)
ufw logging off
# Set UFW logging back to the default
ufw logging low
Firewalld
#placeholder to fill in later
SNMP access
vim /etc/firewalld/services/snmp.xml

<?xml version="1.0" encoding="utf-8"?>
<service>
  <short>SNMP</short>
  <description>SNMP protocol</description>
  <port protocol="udp" port="161"/>
</service>
# Reload so firewalld picks up the new service definition
firewall-cmd --reload
# Permanently allow the SNMP service in the public zone
firewall-cmd --zone=public --add-service snmp --permanent
# Reload again to apply the permanent change
firewall-cmd --reload
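To confirm the service is actually active in the zone:

# Should list snmp among the allowed services
firewall-cmd --zone=public --list-services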
Syslog
Legacy
# /etc/rsyslog.d/70-local-to-rsyslog-server.conf
# Define the hostname to send to the syslog server
$template SendHostname,"<%pri%> %timestamp% myhost.mydomain.nl %syslogtag% %msg%\n"
$ActionForwardDefaultTemplate SendHostname
*.warning @10.77.0.1
Rainerscript
Rainerscript: https://rsyslog.readthedocs.io/en/latest/rainerscript/control_structures.html
# /etc/rsyslog.d/70-local-to-rsyslog-server.conf
# Define a template and specify a hostname to send as:
template(name="SendHostname" type="string"
         string="%timestamp% myhost.mydomain.nl %syslogtag% %msg%\n"
)
# Send logs to the target syslog server and port
*.warning action(type="omfwd" Target="10.0.33.10" Template="SendHostname" Port="514" Protocol="udp")
Testing
# Use the logger tool to test syslog server reception
logger -p local0.error 'Hello World!'
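If the message never shows up on the server, confirm it at least leaves this host (assumes the UDP/514 forwarding configured above):

# Watch for outgoing syslog packets while running logger in another shell
tcpdump -ni any udp port 514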
Ceph
Checks
# Display the running Ceph version
ceph -v
# Check the cluster's health and status
ceph -s
# Watch the cluster's health and status in real time
ceph -w
# Show detailed logs relating to cluster health
ceph health detail
# List all Ceph services and OSDs
ceph orch ls
# List available storage devices
ceph orch device ls
# Show detailed status for a specific service
ceph orch ls --service_name osd.all-available-devices --format yaml
# Re-check the status of a host
ceph cephadm check-host schijf-3
# List all pools
ceph osd lspools
# See the status of all OSDs
ceph osd stat
# List all OSDs
ceph osd tree
# List all placement groups
ceph pg dump
# Check the status of Ceph PGs
ceph pg stat
Commands
# Enter the Ceph shell (single cluster)
cephadm shell
Installation
Using Cephadm
Cephadm
# Create a folder for the cephadm tool
mkdir cephadm
cd cephadm/
# Download cephadm (Quincy)
curl --silent --remote-name --location https://github.com/ceph/ceph/raw/quincy/src/cephadm/cephadm
chmod +x cephadm
# Output help
./cephadm -h
# Install the cephadm (Quincy) release
./cephadm add-repo --release quincy
./cephadm install
# Check if cephadm is properly installed
which cephadm
Bootstrap
# Bootstrap the node and install Ceph
cephadm bootstrap --mon-ip 192.168.100.11
# Check the status of the cluster
cephadm shell -- ceph -s
docker ps

## Optional
# Enter the Ceph shell (single cluster)
cephadm shell
# Exit the Ceph shell
exit
# Install common Ceph packages/tools
cephadm install ceph-common
# Display the Ceph version
ceph -v
# Example output:
ceph version 17.2.6 (d7ff0d10654d2280e08f1ab989c7cdf3064446a5) quincy (stable)
Add additional hosts
# On your bootstrapped node, create a key for SSH access to the other hosts
ssh-keygen
cat .ssh/id_rsa.pub
# Add the newly generated key to the authorized_keys file for the relevant user on the other hosts

# Copy the Ceph cluster's public key to the other nodes
ssh-copy-id -f -i /etc/ceph/ceph.pub root@storage-2
ssh-copy-id -f -i /etc/ceph/ceph.pub root@storage-3
# Add the other nodes to the cluster with the _admin label
ceph orch host add storage-2 10.4.20.2 _admin
ceph orch host add storage-3 10.4.20.3 _admin
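To confirm the hosts joined with the expected labels:

# List cluster hosts and their labels
ceph orch host ls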
OSD creation
If you've installed ceph-osd on your host, this step will fail horribly with errors such as:
-1 bluestore(/var/lib/ceph/osd/ceph-1//block) _read_bdev_label failed to open /var/lib/ceph/osd/ceph-1//block: (13) Permission denied
-1 bdev(0x5571d5f69400 /var/lib/ceph/osd/ceph-1//block) open open got: (13) Permission denied
-1 OSD::mkfs: ObjectStore::mkfs failed with error (13) Permission denied
-1 ** ERROR: error creating empty object store in /var/lib/ceph/osd/ceph-0/: (13) Permission denied
OSD, will rollback changes
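If the host-installed package is indeed the culprit, removing it before re-applying the OSD spec should clear the permission errors; a sketch, assuming a Debian/Ubuntu host (verify what owns the OSD paths before deleting anything):

# Check whether a host package owns the OSD directories
dpkg -S /var/lib/ceph
# Remove the conflicting host package so cephadm's containers manage the OSDs
apt purge ceph-osd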
# Configure all available storage to be used as OSD storage
ceph orch apply osd --all-available-devices
# Check for OSD problems
watch ceph -s
watch ceph osd tree
Commands
# Enter the Ceph shell for a specific cluster
sudo /usr/sbin/cephadm shell --fsid asdjwqe-asjd324-asdki321-821asd-asd241-asdn1234- -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin2.keyring
# Give node storage-4, which is already a cluster member, the _admin label
ceph orch host label add storage-4 _admin
# Mount a Ceph filesystem with 3 mon hosts, using a secretfile
# The secretfile contains ONLY the secret/key
mount -t ceph 192.168.0.11,192.168.0.12,192.168.0.13:/shares/mycustomer/asd8asd8-as8d83-df4mjvjdf /mnt/ceph-storage -o name=customershare-28,secretfile=/etc/ceph/customer28-secretfile
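The secretfile can be created like this; the key value is a hypothetical placeholder, taken from the relevant cephx user:

# Store ONLY the key in the secretfile, and restrict access to root
echo '<CEPHX-KEY>' > /etc/ceph/customer28-secretfile
chmod 600 /etc/ceph/customer28-secretfile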
Upgrade
Make sure your cluster status is healthy first!
# Upgrade Ceph to a specific version
ceph orch upgrade start --ceph-version 17.2.0
# Check the status of the Ceph upgrade
ceph orch upgrade status
# Stop the Ceph upgrade
ceph orch upgrade stop
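A minimal pre-flight gate that only kicks off the upgrade from a healthy state (same target version as above):

# Refuse to start the upgrade unless the cluster reports HEALTH_OK
ceph health | grep -q HEALTH_OK && ceph orch upgrade start --ceph-version 17.2.0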
RBD-NBD
# List available volumes within the openstackhdd pool
rbd ls openstackhdd
# List all available snapshots for object volume-asd9p12o3-90b2-1238-1209-as980d7213hs in the openstackhdd pool
rbd snap ls openstackhdd/volume-asd9p12o3-90b2-1238-1209-as980d7213hs
# Map the volume object to the local filesystem
rbd-nbd map openstackhdd/volume-asd9p12o3-90b2-1238-1209-as980d7213hs
# Map the volume object read-only to the local filesystem
rbd-nbd map --read-only openstackhdd/volume-asd9p12o3-90b2-1238-1209-as980d7213hs
# List currently mapped objects
rbd-nbd list-mapped
# Check what filesystem and partitions the device contains
fdisk -l /dev/nbd1
# Mount the device to a local folder
mount /dev/nbd1p1 /mnt/storage
# Unmount the device from the local folder
umount /mnt/storage
# Two methods to unmap:
# Unmap by device
rbd-nbd unmap /dev/nbd2
# Unmap by object name
rbd-nbd unmap openstackhdd/volume-asd9p12o3-90b2-1238-1209-as980d7213hs
Remove node
# Remove running daemons
ceph orch host drain storage-3
# Remove the host from the cluster
ceph orch host rm storage-3
# On storage-3, reboot the node
shutdown -r now
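Draining can take a while on OSD-bearing hosts; these should show the progress before you remove the host:

# Watch OSD removal/drain progress
ceph orch osd rm status
# Confirm no daemons remain on the host
ceph orch ps storage-3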
Destroy node
Scorched earth
Only execute if you want to destroy your cluster.
# Kill and destroy OSD 0
ceph osd down 0 && ceph osd destroy 0 --force
# Stop Ceph services
systemctl stop ceph-asd82asd-asd8-as92-a889-po89xc732cmn@mon.host-1.service
systemctl stop ceph-asd82asd-asd8-as92-a889-po89xc732cmn@crash.host-1.service
systemctl stop ceph-asd82asd-asd8-as92-a889-po89xc732cmn@mgr.host-1.xmatqa.service
systemctl stop ceph-asd82asd-asd8-as92-a889-po89xc732cmn@node-exporter.host-1.service
systemctl stop ceph-asd82asd-asd8-as92-a889-po89xc732cmn@prometheus.host-1.service
systemctl stop ceph-asd82asd-asd8-as92-a889-po89xc732cmn.target
# Disable Ceph services
systemctl disable ceph-asd82asd-asd8-as92-a889-po89xc732cmn@mon.host-1.service
systemctl disable ceph-asd82asd-asd8-as92-a889-po89xc732cmn@crash.host-1.service
systemctl disable ceph-asd82asd-asd8-as92-a889-po89xc732cmn@mgr.host-1.xmatqa.service
systemctl disable ceph-asd82asd-asd8-as92-a889-po89xc732cmn@node-exporter.host-1.service
systemctl disable ceph-asd82asd-asd8-as92-a889-po89xc732cmn@prometheus.host-1.service
systemctl disable ceph-asd82asd-asd8-as92-a889-po89xc732cmn.target
# Destroy everything (packages, containers, configuration)
ceph-deploy uninstall host-1
ceph-deploy purge host-1
rm -rf /var/lib/ceph
# Check for failed services
systemctl | grep ceph
# Reset them so they disable properly
systemctl reset-failed ceph-asd82asd-asd8-as92-a889-po89xc732cmn@prometheus.host-1.service
# Reboot
shutdown -r now
User management
Create user (RHEL)
# Create the user with a home folder and add them to the wheel group
useradd -m -G wheel john
# Set a password for the john user
passwd john
# Create the SSH folder for john
mkdir -p /home/john/.ssh
# Add a public key to john's account
echo "ssh-rsa 123980idfas89132hadsckjh871234" >> /home/john/.ssh/authorized_keys
# Set proper permissions for the .ssh folder and authorized_keys
chown -R john:john /home/john/.ssh
chmod 700 /home/john/.ssh
chmod 600 /home/john/.ssh/authorized_keys
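A quick sanity check after creating the account (wheel membership and key permissions):

# Confirm group membership
id john
# Confirm the key file is in place with the right modes
ls -la /home/john/.ssh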
Group management
Allow user "chris" to perform the 2 given commands with sudo, no password.
# /etc/sudoers
# Cmnd alias specification
Cmnd_Alias UPDATE_CMDS = /usr/bin/apt-get update, /usr/bin/apt-get upgrade
# User privilege specification
chris ALL=(ALL) NOPASSWD: UPDATE_CMDS
Allow members of the group "researchers" to perform the 2 given commands with sudo, no password.
# /etc/sudoers
# Cmnd alias specification
Cmnd_Alias UPDATE_CMDS = /usr/bin/apt-get update, /usr/bin/apt-get upgrade
# Group privilege specification
%researchers ALL=(ALL) NOPASSWD: UPDATE_CMDS
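Always edit sudoers through visudo; to verify the result:

# Syntax-check the sudoers file
visudo -c
# List what sudo allows for a given user
sudo -l -U chris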
Other
Throughput test
# Test bandwidth throughput with iperf
# Listen on server-A on port 5101
iperf3 -s -p 5101
# Connect to server-A from server-B
iperf3 -c 192.168.0.1 -p 5101
# Testing disk/share write throughput
# Create "testfile" of size 1710x1M in the current folder
time dd if=/dev/zero of=testfile bs=1M count=1710
# Create "testfile2" of size 5x1G in the current folder
time dd if=/dev/zero of=testfile2 bs=1G count=5
# Show copy time of "testfile" to disk or share
time cp testfile /mnt/btrfs/data/<LOCATION>/

# Methods of testing disk or share read throughput
# Show read time from the mount to null
time cat /mnt/btrfs/data/<FILE> > /dev/null
# Show copy time from the mount to null
time dd if=/mnt/btrfs/data/<FILE> of=/dev/null bs=1M
# Show copy time from the mount to the current folder
time cp /mnt/btrfs/data/<FILE> .

# Copy one folder to another with rsync while showing progress
rsync -avhW --no-compress --progress <source>/ <destination>/
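Plain dd runs can measure the page cache rather than the disk; variants that take the cache out of the picture (same file names as above):

# Flush writes to disk before dd reports its timing
time dd if=/dev/zero of=testfile bs=1M count=1710 conv=fdatasync
# Bypass the page cache entirely for reads
time dd if=/mnt/btrfs/data/<FILE> of=/dev/null bs=1M iflag=direct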
Create different temp folder
# Create an alternative tmp folder
mkdir -p /scratch/tmp/
# Point TMPDIR at it for the current shell
export TMPDIR=/scratch/tmp
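Tools that honour TMPDIR will now create their scratch files there; mktemp gives a quick check:

# Should print a path under /scratch/tmp
mktemp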
Inodes
Every object on a filesystem is backed by an inode. A directory's inode points to data that maps the names of the files and folders inside it to their inode numbers, so each filesystem object's name refers to an inode. The inode itself stores metadata about the object: its type, inode number, owner, permissions, size, timestamps and pointers to the data blocks. The name is not stored in the inode; it lives in the directory entry that references it.
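A few ways to inspect inodes on a live system:

# Show the inode number next to each directory entry
ls -i
# Show the full inode metadata (type, size, timestamps, link count) for a file
stat /etc/hostname
# Show inode usage per filesystem (a full inode table can fail writes even with free space)
df -i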