[PATCH][CFT] netfilter hook statistics, third take
Patrick Schaaf
bof@bof.de
Mon, 29 Jul 2002 09:50:48 +0200
--DocE+STaALJfprDB
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Mainly commenting and cleanup; now has Configure.help. Functionally unchanged.
--DocE+STaALJfprDB
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="bof-nf-hookstat-20020729.patch"
# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.654 -> 1.658
# net/Config.in 1.10 -> 1.11
# net/core/netfilter.c 1.7 -> 1.11
# include/linux/netfilter.h 1.2 -> 1.3
# Documentation/Configure.help 1.110 -> 1.111
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/07/27 bof@cdr.(none) 1.655
# Config.in, netfilter.h, netfilter.c:
# netfilter hook statistics
# --------------------------------------------
# 02/07/28 bof@cdr.(none) 1.656
# net/core/netfilter.c:
# slabify, make per-cpu counters.
# --------------------------------------------
# 02/07/28 bof@cdr.(none) 1.657
# net/core/netfilter.c:
# remove debug printks related to slabifying hook statistic counters.
# --------------------------------------------
# 02/07/29 bof@cdr.(none) 1.658
# netfilter.c:
# some more comments, minimal cleanup, KERN_NOTICE upon (un)registration.
# Configure.help:
# friendly help and advice regarding CONFIG_NETFILTER_HOOK_STAT
# --------------------------------------------
#
diff -Nru a/Documentation/Configure.help b/Documentation/Configure.help
--- a/Documentation/Configure.help Mon Jul 29 09:26:48 2002
+++ b/Documentation/Configure.help Mon Jul 29 09:26:48 2002
@@ -2429,6 +2429,23 @@
You can say Y here if you want to get additional messages useful in
debugging the netfilter code.
+Netfilter hook statistics
+CONFIG_NETFILTER_HOOK_STAT
+ If you say Y here, the time spent in the various netfilter hook
+ functions is measured, using the TSC of your processor. Your
+ kernel won't boot when you don't have a working TSC.
+ Say N when you don't have a modern Intel/AMD processor.
+
+ When enabled, look at /proc/net/nf_stat_hook_* for the actual
+ measurement results, presented in a format easy to guess by
+ any well-calibrated crystal ball.
+
+ The timing imposes a processing overhead that may be relevant
+ on machines with high packet rates. The overhead is estimated
+ at about 5% of the time used by the hook functions themselves.
+
+ The safe thing is to say N.
+
Connection tracking (required for masq/NAT)
CONFIG_IP_NF_CONNTRACK
Connection tracking keeps a record of what packets have passed
diff -Nru a/include/linux/netfilter.h b/include/linux/netfilter.h
--- a/include/linux/netfilter.h Mon Jul 29 09:26:48 2002
+++ b/include/linux/netfilter.h Mon Jul 29 09:26:48 2002
@@ -51,6 +51,9 @@
int hooknum;
/* Hooks are ordered in ascending priority. */
int priority;
+#ifdef CONFIG_NETFILTER_HOOK_STAT
+ void *hook_stat;
+#endif
};
struct nf_sockopt_ops
diff -Nru a/net/Config.in b/net/Config.in
--- a/net/Config.in Mon Jul 29 09:26:48 2002
+++ b/net/Config.in Mon Jul 29 09:26:48 2002
@@ -13,6 +13,7 @@
bool 'Network packet filtering (replaces ipchains)' CONFIG_NETFILTER
if [ "$CONFIG_NETFILTER" = "y" ]; then
bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG
+ bool ' Netfilter hook statistics' CONFIG_NETFILTER_HOOK_STAT
fi
bool 'Socket Filtering' CONFIG_FILTER
tristate 'Unix domain sockets' CONFIG_UNIX
diff -Nru a/net/core/netfilter.c b/net/core/netfilter.c
--- a/net/core/netfilter.c Mon Jul 29 09:26:48 2002
+++ b/net/core/netfilter.c Mon Jul 29 09:26:48 2002
@@ -47,6 +47,304 @@
struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
static LIST_HEAD(nf_sockopts);
+#ifdef CONFIG_NETFILTER_HOOK_STAT
+
+/*
+ * menuconfig this under "Network options" >> "Netfilter hook statistics"
+ *
+ * The following code, up to the next #endif, implements per hook
+ * statistics counting. If enabled, look at /proc/net/nf_stat_hook*
+ * for the results.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <asm/msr.h>
+
+/*
+ * nf_stat_hook_proc[pf][hooknum] is a flag per protocol/hook, telling
+ * whether we have already created the /proc/net/nf_stat_hook_X.Y file.
+ * The array is only consulted during module registration. This code
+ * never removes the proc files; when all hook functions unregister,
+ * an empty file remains.
+ *
+ * Not used under normal per-packet processing.
+ */
+static unsigned char nf_stat_hook_proc[NPROTO][NF_MAX_HOOKS];
+
+/*
+ * struct nf_stat_hook_sample is used in nf_inject(), to record the
+ * beginning of the operation. After calling the hook function,
+ * it is reused to compute the duration of the hook function call,
+ * which is then recorded in nf_hook_ops->stat[percpu].
+ *
+ * CPU-local data on the stack, unshared.
+ */
+struct nf_stat_hook_sample {
+ unsigned long long stamp;
+};
+
+/*
+ * struct nf_stat_hook is our main statistics state structure.
+ * It is kept cache-aligned and per-cpu, summing the per-cpu
+ * values only when read through the /proc interface.
+ *
+ * CPU-local data, read across all CPUs only on user request.
+ * Updated locally on each CPU, one update per packet and hook function.
+ */
+struct nf_stat_hook {
+ unsigned long long count;
+ unsigned long long sum;
+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
+
+/*
+ * The nf_stat_hook structures come from our private slab cache.
+ */
+static kmem_cache_t *nf_stat_hook_slab;
+
+/*
+ * nf_stat_hook_zero() is the slab ctor/dtor
+ */
+static void nf_stat_hook_zero(void *data, kmem_cache_t *slab, unsigned long x)
+{
+ struct nf_stat_hook *stat = data;
+ int i;
+
+ for (i=0; i<NR_CPUS; i++,stat++)
+ stat->count = stat->sum = 0;
+}
+
+/*
+ * nf_stat_hook_setup() is the one-time initialization routine.
+ * It allocates the slab cache for our statistics counters,
+ * and initializes the "proc registration" flag array.
+ */
+static void __init nf_stat_hook_setup(void)
+{
+ /* early rdtsc to catch booboo at boot time */
+ { struct nf_stat_hook_sample sample; rdtscll(sample.stamp); }
+
+ nf_stat_hook_slab = kmem_cache_create("nf_stat_hook",
+ NR_CPUS * sizeof(struct nf_stat_hook),
+ 0, SLAB_HWCACHE_ALIGN,
+ nf_stat_hook_zero, nf_stat_hook_zero);
+ if (!nf_stat_hook_slab)
+ printk(KERN_ERR "nf_stat_hook will NOT WORK - no slab.\n");
+
+ memset(nf_stat_hook_proc, 0, sizeof(nf_stat_hook_proc));
+}
+
+/*
+ * nf_stat_hook_read_proc() is a proc_fs read_proc() callback.
+ * Called per protocol/hook, the statistics of all netfilter
+ * hook elements sitting on that hook, are shown, in priority
+ * order. On SMP, the per-cpu counters are summed here.
+ * For accuracy, maybe we need to take some write lock. Later.
+ *
+ * Readings might look strange, until such locking is done.
+ * If you need to compensate, read several times, and throw
+ * out the strange results. Look for silly non-monotonicity.
+ *
+ * Output fields are separated by a single blank, and represent:
+ * [0] address of 'struct nf_hook_ops'. (pointer, in unadorned 8-byte hex)
+ * [1] address of nf_hook_ops->hook() function pointer. When the
+ * hook module is built into the kernel, you can find this
+ * in System.map. (pointer, in unadorned 8-byte hex)
+ * [2] hook priority. (signed integer, in ascii)
+ * [3] number of times hook was called. (unsigned 64 bit integer, in ascii)
+ * [4] total number of cycles spent in the hook function, measured by
+ * summing the rdtscll() differences across the calls. (unsigned
+ * 64 bit integer, in ascii)
+ *
+ * Additional fields may be added in the future; if any field is eventually
+ * retired, it will be set to neutral values: '00000000' for the pointer
+ * fields, and '0' for the integer fields. That's theory, not guarantee. :)
+ */
+static int nf_stat_hook_read_proc(
+ char *page,
+ char **start,
+ off_t off,
+ int count,
+ int *eof,
+ void *data
+) {
+ struct list_head *l;
+ int res;
+
+ for ( res = 0, l = ((struct list_head *)data)->next;
+ l != data;
+ l = l->next
+ ) {
+ int i;
+ struct nf_hook_ops *elem = (struct nf_hook_ops *) l;
+ struct nf_stat_hook *stat = elem->hook_stat;
+
+ if (stat) {
+ unsigned long long count;
+ unsigned long long sum;
+ /* maybe write_lock something here */
+ for (i=0, count=0, sum=0; i<NR_CPUS; i++, stat++) {
+ count += stat->count;
+ sum += stat->sum;
+ }
+ /* and then write_unlock it here */
+ i = sprintf(page+res, "%p %p %d %Lu %Lu\n",
+ elem, elem->hook, elem->priority,
+ count, sum);
+ } else {
+ i = sprintf(page+res, "%p %p %d 0 0\n",
+ elem, elem->hook, elem->priority);
+ }
+ if (i <= 0)
+ break;
+ res += i;
+ }
+ return res;
+}
+
+/*
+ * nf_stat_hook_register() is called whenever a hook element registers.
+ * When necessary, we create a /proc/net/nf_stat_hook* file here,
+ * and we always allocate one struct nf_stat_hook.
+ */
+static void nf_stat_hook_register(struct nf_hook_ops *elem)
+{
+ elem->hook_stat = (NULL == nf_stat_hook_slab)
+ ? 0 : kmem_cache_alloc(nf_stat_hook_slab, SLAB_ATOMIC);
+ if (!elem->hook_stat) return;
+ if (!nf_stat_hook_proc[elem->pf][elem->hooknum]) {
+ char buf[64];
+ char hookname_buf[16];
+ char pfname_buf[16];
+ char *hookname;
+ char *pfname;
+ struct proc_dir_entry *proc;
+
+ switch(elem->pf) {
+ case 2:
+ pfname = "ipv4";
+ switch(elem->hooknum) {
+ case 0:
+ hookname = "PRE-ROUTING";
+ break;
+ case 1:
+ hookname = "LOCAL-IN";
+ break;
+ case 2:
+ hookname = "FORWARD";
+ break;
+ case 3:
+ hookname = "LOCAL-OUT";
+ break;
+ case 4:
+ hookname = "POST-ROUTING";
+ break;
+ default:
+ sprintf(hookname_buf, "hook%d",
+ elem->hooknum);
+ hookname = hookname_buf;
+ break;
+ }
+ break;
+ default:
+ sprintf(hookname_buf, "hook%d",
+ elem->hooknum);
+ hookname = hookname_buf;
+ sprintf(pfname_buf, "pf%d",
+ elem->pf);
+ pfname = pfname_buf;
+ break;
+ }
+ sprintf(buf, "net/nf_stat_hook_%s.%s", pfname, hookname);
+ proc = create_proc_read_entry(buf, 0644, NULL,
+ nf_stat_hook_read_proc,
+ &nf_hooks[elem->pf][elem->hooknum]
+ );
+ if (!proc) {
+ printk(KERN_ERR "cannot create %s\n", buf);
+ kmem_cache_free(nf_stat_hook_slab, elem->hook_stat);
+ elem->hook_stat = 0;
+ return;
+ }
+ proc->owner = THIS_MODULE;
+ }
+ nf_stat_hook_proc[elem->pf][elem->hooknum]++;
+ printk(KERN_NOTICE "nf_stat_hook %d/%d START %p [%d]\n",
+ elem->pf, elem->hooknum,
+ elem->hook,
+ nf_stat_hook_proc[elem->pf][elem->hooknum]);
+}
+
+/*
+ * nf_stat_hook_unregister() is called when a hook element unregisters.
+ * The statistics structure is freed, but we NEVER remove the /proc/net
+ * file entry. Maybe we should. nf_stat_hook_proc[][] contains the correct
+ * counter, I think (modulo races).
+ */
+static void nf_stat_hook_unregister(struct nf_hook_ops *elem)
+{
+ if (elem->hook_stat)
+ kmem_cache_free(nf_stat_hook_slab, elem->hook_stat);
+ nf_stat_hook_proc[elem->pf][elem->hooknum]--;
+ printk(KERN_NOTICE "nf_stat_hook %d/%d STOP %p [%d]\n",
+ elem->pf, elem->hooknum,
+ elem->hook,
+ nf_stat_hook_proc[elem->pf][elem->hooknum]);
+ if (nf_stat_hook_proc[elem->pf][elem->hooknum] == 0)
+ printk(KERN_NOTICE "nf_stat_hook %d/%d empty.\n",
+ elem->pf, elem->hooknum);
+}
+
+/*
+ * Finally, the next two functions implement the real timekeeping.
+ * If rdtscll() proves problematic, these have to be changed.
+ * The _begin() function is called before a specific hook entry
+ * function gets called - it starts the timer.
+ * The _end() function is called after the hook entry function,
+ * and it stops the timer, and remembers the interval in the
+ * statistics structure (per-cpu).
+ */
+
+static inline void nf_stat_hook_begin(struct nf_stat_hook_sample *sample)
+{
+ rdtscll(sample->stamp);
+}
+
+static inline void nf_stat_hook_end(
+ struct nf_stat_hook_sample *sample,
+ struct nf_hook_ops *elem,
+ int verdict
+) {
+ struct nf_stat_hook *stat = elem->hook_stat;
+ struct nf_stat_hook_sample now;
+ if (!stat) return;
+ rdtscll(now.stamp); now.stamp -= sample->stamp;
+ stat += smp_processor_id();
+ stat->count++;
+ stat->sum += now.stamp;
+}
+
+#else
+
+/*
+ * Here, a set of empty macros provides for nice ifdef free callers into
+ * this statistics code. If CONFIG_NETFILTER_HOOK_STAT is NOT defined,
+ * these should make the compiled code identical to what we had before.
+ */
+struct nf_stat_hook_sample {};
+#define nf_stat_hook_begin(a) do{}while(0)
+#define nf_stat_hook_end(a,b,c) do{}while(0)
+#define nf_stat_hook_register(a) do{}while(0)
+#define nf_stat_hook_unregister(a) do{}while(0)
+#define nf_stat_hook_setup() do{}while(0)
+
+/*
+ * End of new statistics stuff. On with the traditional net/core/netfilter.c
+ * Search below for "nf_stat_hook" to see where we call into the statistics.
+ */
+#endif
+
/*
* A queue handler may be registered for each protocol. Each is protected by
* long term mutex. The handler must provide an an outfn() to accept packets
@@ -68,6 +366,7 @@
if (reg->priority < ((struct nf_hook_ops *)i)->priority)
break;
}
+ nf_stat_hook_register(reg);
list_add(®->list, i->prev);
br_write_unlock_bh(BR_NETPROTO_LOCK);
return 0;
@@ -77,6 +376,7 @@
{
br_write_lock_bh(BR_NETPROTO_LOCK);
list_del(®->list);
+ nf_stat_hook_unregister(reg);
br_write_unlock_bh(BR_NETPROTO_LOCK);
}
@@ -346,14 +646,19 @@
{
for (*i = (*i)->next; *i != head; *i = (*i)->next) {
struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
+ struct nf_stat_hook_sample sample;
+ nf_stat_hook_begin(&sample);
switch (elem->hook(hook, skb, indev, outdev, okfn)) {
case NF_QUEUE:
+ nf_stat_hook_end(&sample, elem, NF_QUEUE);
return NF_QUEUE;
case NF_STOLEN:
+ nf_stat_hook_end(&sample, elem, NF_STOLEN);
return NF_STOLEN;
case NF_DROP:
+ nf_stat_hook_end(&sample, elem, NF_DROP);
return NF_DROP;
case NF_REPEAT:
@@ -369,6 +674,7 @@
elem->hook, hook);
#endif
}
+ nf_stat_hook_end(&sample, elem, NF_ACCEPT);
}
return NF_ACCEPT;
}
@@ -638,4 +944,5 @@
for (h = 0; h < NF_MAX_HOOKS; h++)
INIT_LIST_HEAD(&nf_hooks[i][h]);
}
+ nf_stat_hook_setup();
}
--DocE+STaALJfprDB--