[PATCH][CFT] netfilter hook statistics, third take

Patrick Schaaf bof@bof.de
Mon, 29 Jul 2002 09:50:48 +0200


--DocE+STaALJfprDB
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Mainly commenting and cleanup; now has Configure.help. Functionally unchanged.


--DocE+STaALJfprDB
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="bof-nf-hookstat-20020729.patch"

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	1.654   -> 1.658  
#	       net/Config.in	1.10    -> 1.11   
#	net/core/netfilter.c	1.7     -> 1.11   
#	include/linux/netfilter.h	1.2     -> 1.3    
#	Documentation/Configure.help	1.110   -> 1.111  
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/07/27	bof@cdr.(none)	1.655
# Config.in, netfilter.h, netfilter.c:
#   netfilter hook statistics
# --------------------------------------------
# 02/07/28	bof@cdr.(none)	1.656
# net/core/netfilter.c:
#   slabify, make per-cpu counters.
# --------------------------------------------
# 02/07/28	bof@cdr.(none)	1.657
# net/core/netfilter.c:
#   remove debug printks related to slabifying hook statistic counters.
# --------------------------------------------
# 02/07/29	bof@cdr.(none)	1.658
# netfilter.c:
#   some more comments, minimal cleanup, KERN_NOTICE upon (un)registration.
# Configure.help:
#   friendly help and advice regarding CONFIG_NETFILTER_HOOK_STAT
# --------------------------------------------
#
diff -Nru a/Documentation/Configure.help b/Documentation/Configure.help
--- a/Documentation/Configure.help	Mon Jul 29 09:26:48 2002
+++ b/Documentation/Configure.help	Mon Jul 29 09:26:48 2002
@@ -2429,6 +2429,23 @@
   You can say Y here if you want to get additional messages useful in
   debugging the netfilter code.
 
+Netfilter hook statistics
+CONFIG_NETFILTER_HOOK_STAT
+  If you say Y here, the time spent in the various netfilter hook
+  functions is measured, using the TSC of your processor. Your
+  kernel won't boot when you don't have a working TSC.
+  Say N when you don't have a modern Intel/AMD processor.
+
+  When enabled, look at /proc/net/nf_stat_hook_* for the actual
+  measurement results, presented in a format easy to guess by
+  any well-calibrated crystal ball.
+
+  The timing imposes a processing overhead that may be relevant
+  on machines with high packet rates. The overhead is estimated
+  at about 5% of the time used by the hook functions themselves.
+
+  The safe thing is to say N.
+
 Connection tracking (required for masq/NAT)
 CONFIG_IP_NF_CONNTRACK
   Connection tracking keeps a record of what packets have passed
diff -Nru a/include/linux/netfilter.h b/include/linux/netfilter.h
--- a/include/linux/netfilter.h	Mon Jul 29 09:26:48 2002
+++ b/include/linux/netfilter.h	Mon Jul 29 09:26:48 2002
@@ -51,6 +51,9 @@
 	int hooknum;
 	/* Hooks are ordered in ascending priority. */
 	int priority;
+#ifdef CONFIG_NETFILTER_HOOK_STAT
+	void *hook_stat;
+#endif
 };
 
 struct nf_sockopt_ops
diff -Nru a/net/Config.in b/net/Config.in
--- a/net/Config.in	Mon Jul 29 09:26:48 2002
+++ b/net/Config.in	Mon Jul 29 09:26:48 2002
@@ -13,6 +13,7 @@
 bool 'Network packet filtering (replaces ipchains)' CONFIG_NETFILTER
 if [ "$CONFIG_NETFILTER" = "y" ]; then
    bool '  Network packet filtering debugging' CONFIG_NETFILTER_DEBUG
+   bool '  Netfilter hook statistics' CONFIG_NETFILTER_HOOK_STAT
 fi
 bool 'Socket Filtering'  CONFIG_FILTER
 tristate 'Unix domain sockets' CONFIG_UNIX
diff -Nru a/net/core/netfilter.c b/net/core/netfilter.c
--- a/net/core/netfilter.c	Mon Jul 29 09:26:48 2002
+++ b/net/core/netfilter.c	Mon Jul 29 09:26:48 2002
@@ -47,6 +47,304 @@
 struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
 static LIST_HEAD(nf_sockopts);
 
+#ifdef CONFIG_NETFILTER_HOOK_STAT
+
+/*
+ * menuconfig this under "Network options" >> "Netfilter hook statistics"
+ *
+ * The following code, up to the next #endif, implements per hook
+ * statistics counting. If enabled, look at /proc/net/nf_stat_hook*
+ * for the results.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <asm/msr.h>
+
+/*
+ * nf_stat_hook_proc[pf][hooknum] is a counter per protocol/hook of the
+ * hook elements registered with statistics; while it is nonzero, the
+ * /proc/net/nf_stat_hook_X.Y file is assumed to exist already. The array
+ * is only touched during hook (un)registration. This code never removes
+ * the proc files; when all hook functions unregister, an empty file remains.
+ *
+ * Not used under normal per-packet processing.
+ */
+static unsigned char nf_stat_hook_proc[NPROTO][NF_MAX_HOOKS];
+
+/*
+ * struct nf_stat_hook_sample is used in nf_inject(), to record the
+ * beginning of the operation. After calling the hook function,
+ * it is reused to compute the duration of the hook function call,
+ * which is then recorded in nf_hook_ops->hook_stat[cpu].
+ *
+ * CPU-local data on the stack, unshared.
+ */
+struct nf_stat_hook_sample {
+	unsigned long long stamp;
+};
+
+/*
+ * struct nf_stat_hook is our main statistics state structure.
+ * It is kept cache-aligned and per-cpu, summing the per-cpu
+ * values only when read through the /proc interface.
+ *
+ * CPU-local data, read across all CPUs only on user request.
+ * Updated locally on each CPU, one update per packet and hook function.
+ */
+struct nf_stat_hook {
+	unsigned long long count;
+	unsigned long long sum;
+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
+
+/*
+ * The nf_stat_hook structures come from our private slab cache.
+ */
+static kmem_cache_t *nf_stat_hook_slab;
+
+/*
+ * nf_stat_hook_zero() is the slab ctor/dtor
+ */
+static void nf_stat_hook_zero(void *data, kmem_cache_t *slab, unsigned long x)
+{
+	struct nf_stat_hook *stat = data;
+	int i;
+
+	for (i=0; i<NR_CPUS; i++,stat++)
+		stat->count = stat->sum = 0;
+}
+
+/*
+ * nf_stat_hook_setup() is the one-time initialization routine.
+ * It allocates the slab cache for our statistics counters,
+ * and initializes the "proc registration" flag array.
+ */
+static void __init nf_stat_hook_setup(void)
+{
+	/* early rdtsc to catch booboo at boot time */
+	{ struct nf_stat_hook_sample sample; rdtscll(sample.stamp); }
+
+	nf_stat_hook_slab = kmem_cache_create("nf_stat_hook",
+				NR_CPUS * sizeof(struct nf_stat_hook),
+				0, SLAB_HWCACHE_ALIGN,
+				nf_stat_hook_zero, nf_stat_hook_zero);
+	if (!nf_stat_hook_slab)
+		printk(KERN_ERR "nf_stat_hook will NOT WORK - no slab.\n");
+
+	memset(nf_stat_hook_proc, 0, sizeof(nf_stat_hook_proc));
+}
+
+/*
+ * nf_stat_hook_read_proc() is a proc_fs read_proc() callback.
+ * Called per protocol/hook, the statistics of all netfilter
+ * hook elements sitting on that hook, are shown, in priority
+ * order. On SMP, the per-cpu counters are summed here.
+ * For accuracy, maybe we need to take some write lock. Later.
+ *
+ * Readings might look strange, until such locking is done.
+ * If you need to compensate, read several times, and throw
+ * out the strange results. Look for silly non-monotonicity.
+ *
+ * Output fields are separated by a single blank, and represent:
+ * [0] address of 'struct nf_hook_ops'. (pointer, in unadorned 8-byte hex)
+ * [1] address of nf_hook_ops->hook() function pointer. When the
+ *     hook module is built into the kernel, you can find this
+ *     in System.map. (pointer, in unadorned 8-byte hex)
+ * [2] hook priority. (signed integer, in ascii)
+ * [3] number of times hook was called. (unsigned 64 bit integer, in ascii)
+ * [4] total number of cycles spent in the hook function, measured by
+ *     summing the rdtscll() differences across the calls. (unsigned
+ *     64 bit integer, in ascii)
+ *
+ * Additional fields may be added in the future; if any field is eventually
+ * retired, it will be set to neutral values: '00000000' for the pointer
+ * fields, and '0' for the integer fields. That's theory, not guarantee. :)
+ */
+static int nf_stat_hook_read_proc(
+	char *page,
+	char **start,
+	off_t off,
+	int count,
+	int *eof,
+	void *data
+) {
+	struct list_head *l;
+	int res;
+
+	for (	res = 0, l = ((struct list_head *)data)->next;
+		l != data;
+		l = l->next
+	) {
+		int i;
+		struct nf_hook_ops *elem = (struct nf_hook_ops *) l;
+		struct nf_stat_hook *stat = elem->hook_stat;
+
+		if (stat) {
+			unsigned long long count;
+			unsigned long long sum;
+			/* maybe write_lock something here */
+			for (i=0, count=0, sum=0; i<NR_CPUS; i++, stat++) {
+				count += stat->count;
+				sum += stat->sum;
+			}
+			/* and then write_unlock it here */
+			i = sprintf(page+res, "%p %p %d %Lu %Lu\n",
+					elem, elem->hook, elem->priority,
+					count, sum);
+		} else {
+			i = sprintf(page+res, "%p %p %d 0 0\n",
+					elem, elem->hook, elem->priority);
+		}
+		if (i <= 0)
+			break;
+		res += i;
+	}
+	return res;
+}
+
+/*
+ * nf_stat_hook_register() is called whenever a hook element registers.
+ * When necessary, we create a /proc/net/nf_stat_hook* file here,
+ * and we always try to allocate one per-cpu array of struct nf_stat_hook.
+ */
+static void nf_stat_hook_register(struct nf_hook_ops *elem)
+{
+	elem->hook_stat = (NULL == nf_stat_hook_slab)
+		? 0 : kmem_cache_alloc(nf_stat_hook_slab, SLAB_ATOMIC);
+	if (!elem->hook_stat) return;
+	if (!nf_stat_hook_proc[elem->pf][elem->hooknum]) {
+		char buf[64];
+		char hookname_buf[16];
+		char pfname_buf[16];
+		char *hookname;
+		char *pfname;
+		struct proc_dir_entry *proc;
+
+		switch(elem->pf) {
+			case 2:
+				pfname = "ipv4";
+				switch(elem->hooknum) {
+					case 0:
+						hookname = "PRE-ROUTING";
+						break;
+					case 1:
+						hookname = "LOCAL-IN";
+						break;
+					case 2:
+						hookname = "FORWARD";
+						break;
+					case 3:
+						hookname = "LOCAL-OUT";
+						break;
+					case 4:
+						hookname = "POST-ROUTING";
+						break;
+					default:
+						sprintf(hookname_buf, "hook%d",
+							elem->hooknum);
+						hookname = hookname_buf;
+						break;
+				}
+				break;
+			default:
+				sprintf(hookname_buf, "hook%d",
+					elem->hooknum);
+				hookname = hookname_buf;
+				sprintf(pfname_buf, "pf%d",
+					elem->pf);
+				pfname = pfname_buf;
+				break;
+		}
+		sprintf(buf, "net/nf_stat_hook_%s.%s", pfname, hookname);
+		proc = create_proc_read_entry(buf, 0644, NULL,
+			nf_stat_hook_read_proc,
+			&nf_hooks[elem->pf][elem->hooknum]
+		);
+		if (!proc) {
+			printk(KERN_ERR "cannot create %s\n", buf);
+			kmem_cache_free(nf_stat_hook_slab, elem->hook_stat);
+			elem->hook_stat = 0;
+			return;
+		}
+		proc->owner = THIS_MODULE;
+	}
+	nf_stat_hook_proc[elem->pf][elem->hooknum]++;
+	printk(KERN_NOTICE "nf_stat_hook %d/%d START %p [%d]\n",
+			elem->pf, elem->hooknum,
+			elem->hook,
+			nf_stat_hook_proc[elem->pf][elem->hooknum]);
+}
+
+/*
+ * nf_stat_hook_unregister() is called when a hook element unregisters.
+ * The statistics structure is freed, but we NEVER remove the /proc/net
+ * file entry. Maybe we should. nf_stat_hook_proc[][] contains the correct
+ * counter, I think (modulo races).
+ */
+static void nf_stat_hook_unregister(struct nf_hook_ops *elem)
+{
+	if (elem->hook_stat)
+		kmem_cache_free(nf_stat_hook_slab, elem->hook_stat);
+	nf_stat_hook_proc[elem->pf][elem->hooknum]--;
+	printk(KERN_NOTICE "nf_stat_hook %d/%d STOP %p [%d]\n",
+			elem->pf, elem->hooknum,
+			elem->hook,
+			nf_stat_hook_proc[elem->pf][elem->hooknum]);
+	if (nf_stat_hook_proc[elem->pf][elem->hooknum] == 0)
+		printk(KERN_NOTICE "nf_stat_hook %d/%d empty.\n",
+			elem->pf, elem->hooknum);
+}
+
+/*
+ * Finally, the next two functions implement the real timekeeping.
+ * If rdtscll() proves problematic, these have to be changed.
+ * The _begin() function is called before a specific hook entry
+ * function gets called - it starts the timer.
+ * The _end() function is called after the hook entry function,
+ * and it stops the timer, and remembers the interval in the
+ * statistics structure (per-cpu).
+ */
+
+static inline void nf_stat_hook_begin(struct nf_stat_hook_sample *sample)
+{
+	rdtscll(sample->stamp);
+}
+
+static inline void nf_stat_hook_end(
+	struct nf_stat_hook_sample *sample,
+	struct nf_hook_ops *elem,
+	int verdict
+) {
+	struct nf_stat_hook *stat = elem->hook_stat;
+	struct nf_stat_hook_sample now;
+	if (!stat) return;
+	rdtscll(now.stamp); now.stamp -= sample->stamp;
+	stat += smp_processor_id();
+	stat->count++;
+	stat->sum += now.stamp;
+}
+
+#else
+
+/*
+ * Here, a set of empty macros provides for nice ifdef-free callers into
+ * this statistics code. If CONFIG_NETFILTER_HOOK_STAT is NOT defined,
+ * these should make the compiled code identical to what we had before.
+ */
+struct nf_stat_hook_sample {};
+#define nf_stat_hook_begin(a) do{}while(0)
+#define nf_stat_hook_end(a,b,c) do{}while(0)
+#define nf_stat_hook_register(a) do{}while(0)
+#define nf_stat_hook_unregister(a) do{}while(0)
+#define nf_stat_hook_setup() do{}while(0)
+
+/*
+ * End of new statistics stuff. On with the traditional net/core/netfilter.c
+ * Search below for "nf_stat_hook" to see where we call into the statistics.
+ */
+#endif
+
 /* 
  * A queue handler may be registered for each protocol.  Each is protected by
  * long term mutex.  The handler must provide an an outfn() to accept packets
@@ -68,6 +366,7 @@
 		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
 			break;
 	}
+	nf_stat_hook_register(reg);
 	list_add(&reg->list, i->prev);
 	br_write_unlock_bh(BR_NETPROTO_LOCK);
 	return 0;
@@ -77,6 +376,7 @@
 {
 	br_write_lock_bh(BR_NETPROTO_LOCK);
 	list_del(&reg->list);
+	nf_stat_hook_unregister(reg);
 	br_write_unlock_bh(BR_NETPROTO_LOCK);
 }
 
@@ -346,14 +646,19 @@
 {
 	for (*i = (*i)->next; *i != head; *i = (*i)->next) {
 		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
+		struct nf_stat_hook_sample sample;
+		nf_stat_hook_begin(&sample);
 		switch (elem->hook(hook, skb, indev, outdev, okfn)) {
 		case NF_QUEUE:
+			nf_stat_hook_end(&sample, elem, NF_QUEUE);
 			return NF_QUEUE;
 
 		case NF_STOLEN:
+			nf_stat_hook_end(&sample, elem, NF_STOLEN);
 			return NF_STOLEN;
 
 		case NF_DROP:
+			nf_stat_hook_end(&sample, elem, NF_DROP);
 			return NF_DROP;
 
 		case NF_REPEAT:
@@ -369,6 +674,7 @@
 				elem->hook, hook);
 #endif
 		}
+		nf_stat_hook_end(&sample, elem, NF_ACCEPT);
 	}
 	return NF_ACCEPT;
 }
@@ -638,4 +944,5 @@
 		for (h = 0; h < NF_MAX_HOOKS; h++)
 			INIT_LIST_HEAD(&nf_hooks[i][h]);
 	}
+	nf_stat_hook_setup();
 }

--DocE+STaALJfprDB--