[PATCH] x_tables : Use SMP friendly (percpu) rwlock_t to protect
x_tables
Eric Dumazet
dada1 at cosmosbay.com
Fri Jan 27 16:16:15 CET 2006
This patch helps scalability of x_tables on SMP, especially NUMA machines.
Instead of using a single rwlock_t to protect x_tables, use an array of
rwlock_t, one for each possible cpu in the machine.
1) No more cache line ping pongs between cpus. read_lock_bh(&table->lock) /
read_unlock_bh(&table->lock) were still expensive because of the very hot
central rwlock.
2) get_counters() is more latency friendly than before (we lock each sub table
one at a time instead of the full x_tables)
3) When a replace of table must be done, the 'writer' have to write_lock_bh()
all the locks of the array. This is 'expensive' but seldom done (underlying
vmalloc/copy_from_user costs are more expensive anyway)
This patch was tested on a 4 way Opteron machine, two gigabit links, with
rather complex iptables rules (around 200 rules) and gives excellent results
so far.
Thank you
Eric Dumazet
Signed-off-by: Eric Dumazet <dada1 at cosmosbay.com>
-------------- next part --------------
--- linux-2.6.16-rc1/include/linux/netfilter/x_tables.h 2006-01-27 16:38:08.000000000 +0100
+++ linux-2.6.16-rc1-edinclude/linux/netfilter/x_tables.h 2006-01-27 16:39:07.000000000 +0100
@@ -170,7 +170,13 @@
unsigned int valid_hooks;
/* Lock for the curtain */
+#ifdef CONFIG_SMP
+ rwlock_t *lockp;
+# define xt_table_lockaddr(X,cpu) per_cpu_ptr((X)->lockp, cpu)
+#else
rwlock_t lock;
+# define xt_table_lockaddr(X,cpu) (&(X)->lock)
+#endif
/* Man behind the curtain... */
//struct ip6t_table_info *private;
@@ -207,6 +213,7 @@
extern int xt_register_match(int af, struct xt_match *target);
extern void xt_unregister_match(int af, struct xt_match *target);
+extern int xt_table_global_init(struct xt_table *table);
extern int xt_register_table(struct xt_table *table,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo);
--- linux-2.6.16-rc1/net/ipv4/netfilter/ip_tables.c 2006-01-27 15:11:58.000000000 +0100
+++ linux-2.6.16-rc1-ednet/ipv4/netfilter/ip_tables.c 2006-01-27 16:39:28.000000000 +0100
@@ -230,6 +230,8 @@
void *table_base;
struct ipt_entry *e, *back;
struct xt_table_info *private = table->private;
+ int curcpu;
+ rwlock_t *lockp;
/* Initialization */
ip = (*pskb)->nh.iph;
@@ -244,9 +246,11 @@
* match it. */
offset = ntohs(ip->frag_off) & IP_OFFSET;
- read_lock_bh(&table->lock);
+ curcpu = get_cpu();
+ lockp = xt_table_lockaddr(table, curcpu);
+ read_lock_bh(lockp);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)private->entries[smp_processor_id()];
+ table_base = (void *)private->entries[curcpu];
e = get_entry(table_base, private->hook_entry[hook]);
/* For return from builtin chain */
@@ -336,7 +340,8 @@
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ read_unlock_bh(lockp);
+ put_cpu();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -758,7 +763,8 @@
static void
get_counters(const struct xt_table_info *t,
- struct xt_counters counters[])
+ struct xt_counters counters[],
+ struct xt_table *table)
{
unsigned int cpu;
unsigned int i;
@@ -767,26 +773,34 @@
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
- * We dont care about preemption here.
*/
- curcpu = raw_smp_processor_id();
+ curcpu = get_cpu();
+ if (table)
+ write_lock_bh(xt_table_lockaddr(table, curcpu));
i = 0;
IPT_ENTRY_ITERATE(t->entries[curcpu],
t->size,
set_entry_to_counter,
counters,
&i);
+ if (table)
+ write_unlock_bh(xt_table_lockaddr(table, curcpu));
+ put_cpu();
for_each_cpu(cpu) {
if (cpu == curcpu)
continue;
+ if (table)
+ write_lock_bh(xt_table_lockaddr(table, cpu));
i = 0;
IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
+ if (table)
+ write_unlock_bh(xt_table_lockaddr(table, cpu));
}
}
@@ -812,9 +826,7 @@
return -ENOMEM;
/* First, sum counters... */
- write_lock_bh(&table->lock);
- get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ get_counters(private, counters, table);
/* choose the copy that is on our node/cpu, ...
* This choice is lazy (because current thread is
@@ -977,7 +989,7 @@
module_put(t->me);
/* Get the old counters. */
- get_counters(oldinfo, counters);
+ get_counters(oldinfo, counters, NULL);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
@@ -1032,6 +1044,7 @@
struct xt_table_info *private;
int ret = 0;
void *loc_cpu_entry;
+ int cpu;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1054,7 +1067,8 @@
goto free;
}
- write_lock_bh(&t->lock);
+ cpu = get_cpu();
+ write_lock_bh(xt_table_lockaddr(t, cpu));
private = t->private;
if (private->number != paddc->num_counters) {
ret = -EINVAL;
@@ -1063,14 +1077,15 @@
i = 0;
/* Choose the copy that is on our node */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[cpu];
IPT_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc->counters,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ write_unlock_bh(xt_table_lockaddr(t, cpu));
+ put_cpu();
xt_table_unlock(t);
module_put(t->me);
free:
@@ -1215,10 +1230,12 @@
= { 0, 0, 0, { 0 }, { 0 }, { } };
void *loc_cpu_entry;
+ ret = xt_table_global_init(table);
+ if (ret)
+ return ret;
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
-
/* choose the copy on our node/cpu
* but dont care of preemption
*/
--- linux-2.6.16-rc1/net/netfilter/x_tables.c 2006-01-27 15:36:26.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/netfilter/x_tables.c 2006-01-27 16:40:56.000000000 +0100
@@ -309,6 +309,44 @@
}
EXPORT_SYMBOL_GPL(xt_table_unlock);
+/*
+ * Dont use write_lock_bh() in the loop, because NR_CPUS
+ * might be very large and preempt_count could overflow
+ */
+static void xt_table_global_lock(struct xt_table *table)
+{
+ int cpu;
+ local_bh_disable();
+ preempt_disable();
+ for_each_cpu(cpu) {
+ _raw_write_lock(xt_table_lockaddr(table, cpu));
+ }
+}
+
+static void xt_table_global_unlock(struct xt_table *table)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ _raw_write_unlock(xt_table_lockaddr(table, cpu));
+ }
+ preempt_enable_no_resched();
+ local_bh_enable();
+}
+
+int xt_table_global_init(struct xt_table *table)
+{
+ int cpu;
+#ifdef CONFIG_SMP
+ if (!table->lockp) {
+ table->lockp = alloc_percpu(rwlock_t);
+ if (!table->lockp)
+ return -ENOMEM;
+ }
+#endif
+ for_each_cpu(cpu)
+ rwlock_init(xt_table_lockaddr(table, cpu));
+ return 0;
+}
struct xt_table_info *
xt_replace_table(struct xt_table *table,
@@ -319,20 +357,20 @@
struct xt_table_info *oldinfo, *private;
/* Do the substitution. */
- write_lock_bh(&table->lock);
+ xt_table_global_lock(table);
private = table->private;
/* Check inside lock: is the old number correct? */
if (num_counters != private->number) {
duprintf("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
- write_unlock_bh(&table->lock);
+ xt_table_global_unlock(table);
*error = -EAGAIN;
return NULL;
}
oldinfo = private;
table->private = newinfo;
newinfo->initial_entries = oldinfo->initial_entries;
- write_unlock_bh(&table->lock);
+ xt_table_global_unlock(table);
return oldinfo;
}
@@ -366,7 +404,6 @@
/* save number of initial entries */
private->initial_entries = private->number;
- rwlock_init(&table->lock);
list_prepend(&xt[table->af].tables, table);
ret = 0;
--- linux-2.6.16-rc1/net/ipv4/netfilter/ip_nat_rule.c 2006-01-27 16:10:50.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/ipv4/netfilter/ip_nat_rule.c 2006-01-27 16:32:11.000000000 +0100
@@ -93,7 +93,6 @@
static struct ipt_table nat_table = {
.name = "nat",
.valid_hooks = NAT_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
.af = AF_INET,
};
--- linux-2.6.16-rc1/net/ipv4/netfilter/iptable_mangle.c 2006-01-27 16:11:00.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/ipv4/netfilter/iptable_mangle.c 2006-01-27 16:32:11.000000000 +0100
@@ -107,7 +107,6 @@
static struct ipt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
.af = AF_INET,
};
--- linux-2.6.16-rc1/net/ipv4/netfilter/iptable_raw.c 2006-01-27 16:11:37.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/ipv4/netfilter/iptable_raw.c 2006-01-27 16:11:52.000000000 +0100
@@ -82,7 +82,6 @@
static struct ipt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
.af = AF_INET,
};
--- linux-2.6.16-rc1/net/ipv4/netfilter/arptable_filter.c 2006-01-27 16:20:26.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/ipv4/netfilter/arptable_filter.c 2006-01-27 16:22:59.000000000 +0100
@@ -142,7 +142,6 @@
static struct arpt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.private = NULL,
.me = THIS_MODULE,
.af = NF_ARP,
--- linux-2.6.16-rc1/net/ipv6/netfilter/ip6_tables.c 2006-01-27 16:21:56.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/ipv6/netfilter/ip6_tables.c 2006-01-27 16:43:42.000000000 +0100
@@ -283,6 +283,8 @@
void *table_base;
struct ip6t_entry *e, *back;
struct xt_table_info *private;
+ int curcpu;
+ rwlock_t *lockp;
/* Initialization */
indev = in ? in->name : nulldevname;
@@ -294,10 +296,12 @@
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
- read_lock_bh(&table->lock);
+ curcpu = get_cpu();
+ lockp = xt_table_lockaddr(table, curcpu);
+ read_lock_bh(lockp);
private = table->private;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)private->entries[smp_processor_id()];
+ table_base = (void *)private->entries[curcpu];
e = get_entry(table_base, private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
@@ -403,7 +407,8 @@
#ifdef CONFIG_NETFILTER_DEBUG
((struct ip6t_entry *)table_base)->comefrom = 0xdead57ac;
#endif
- read_unlock_bh(&table->lock);
+ read_unlock_bh(lockp);
+ put_cpu();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -825,7 +830,8 @@
static void
get_counters(const struct xt_table_info *t,
- struct xt_counters counters[])
+ struct xt_counters counters[],
+ struct xt_table *table)
{
unsigned int cpu;
unsigned int i;
@@ -834,26 +840,34 @@
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
- * We dont care about preemption here.
*/
- curcpu = raw_smp_processor_id();
+ curcpu = get_cpu();
+ if (table)
+ write_lock_bh(xt_table_lockaddr(table, curcpu));
i = 0;
IP6T_ENTRY_ITERATE(t->entries[curcpu],
t->size,
set_entry_to_counter,
counters,
&i);
+ if (table)
+ write_unlock_bh(xt_table_lockaddr(table, curcpu));
+ put_cpu();
for_each_cpu(cpu) {
if (cpu == curcpu)
continue;
+ if (table)
+ write_lock_bh(xt_table_lockaddr(table, cpu));
i = 0;
IP6T_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
+ if (table)
+ write_unlock_bh(xt_table_lockaddr(table, cpu));
}
}
@@ -879,9 +893,7 @@
return -ENOMEM;
/* First, sum counters... */
- write_lock_bh(&table->lock);
- get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ get_counters(private, counters, table);
/* choose the copy that is on ourc node/cpu */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1034,7 +1046,7 @@
module_put(t->me);
/* Get the old counters. */
- get_counters(oldinfo, counters);
+ get_counters(oldinfo, counters, NULL);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
@@ -1089,6 +1101,7 @@
struct xt_table *t;
int ret = 0;
void *loc_cpu_entry;
+ int cpu;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1111,7 +1124,8 @@
goto free;
}
- write_lock_bh(&t->lock);
+ cpu = get_cpu();
+ write_lock_bh(xt_table_lockaddr(t, cpu));
private = t->private;
if (private->number != paddc->num_counters) {
ret = -EINVAL;
@@ -1120,14 +1134,15 @@
i = 0;
/* Choose the copy that is on our node */
- loc_cpu_entry = private->entries[smp_processor_id()];
+ loc_cpu_entry = private->entries[cpu];
IP6T_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc->counters,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ write_unlock_bh(xt_table_lockaddr(t, cpu));
+ put_cpu();
xt_table_unlock(t);
module_put(t->me);
free:
--- linux-2.6.16-rc1/net/ipv6/netfilter/ip6table_filter.c 2006-01-27 16:25:54.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/ipv6/netfilter/ip6table_filter.c 2006-01-27 16:26:05.000000000 +0100
@@ -95,7 +95,6 @@
static struct ip6t_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
.af = AF_INET6,
};
--- linux-2.6.16-rc1/net/ipv6/netfilter/ip6table_raw.c 2006-01-27 16:26:57.000000000 +0100
+++ linux-2.6.16-rc1-ed/net/ipv6/netfilter/ip6table_raw.c 2006-01-27 16:27:12.000000000 +0100
@@ -109,7 +109,6 @@
static struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
.af = AF_INET6,
};
More information about the netfilter-devel
mailing list