Before this patch, fault injection uses a combination of randomness and
frequency to determine where to inject faults. The problem with this is
that code paths which are executed very rarely get proportional amounts
of faults injected.

A better heuristic is to look at the actual callchain leading up to the
possible failure point; if we see a callchain that we've never seen up
until this point, chances are it's a rare one and we should definitely
inject a fault here (since we might not get the chance again later).

This uses a probabilistic set structure (similar to a bloom filter) to
determine whether we have seen a particular callchain before by hashing
the stack trace and atomically testing/setting a bit corresponding to
the current callchain.

There is a possibility of false positives in the seen-set (i.e. a hash
collision makes us think we have seen a particular callchain before
when in fact we haven't, therefore we don't inject a fault where we
should have). We might use some sort of random seed here, but the
additional complexity doesn't seem worth it to me.

This finds a lot more bugs than just plain fault injection.

Signed-off-by: Vegard Nossum <vegard.nos...@oracle.com>
---
 lib/Kconfig.debug  | 29 +++++++++++++++++++++++++++++
 lib/fault-inject.c | 36 +++++++++++++++++++++++++++++++-----
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 52f7e14..9e81720 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1701,6 +1701,35 @@ config FAULT_INJECTION_STACKTRACE_FILTER
        help
          Provide stacktrace filter for fault-injection capabilities
 
+config FAULT_INJECTION_AT_NEW_CALLSITES
+       bool "Inject fault the first time at a new callsite"
+       depends on FAULT_INJECTION_STACKTRACE_FILTER
+       help
+         Without this, fault injection uses a combination of randomness
+         and frequency to determine where to inject faults. The problem
+         with this is that code paths which are executed very rarely get
+         proportional amounts of faults injected.
+
+         A better heuristic is to look at the actual callchain leading
+         up to the possible failure point; if we see a callchain that
+         we've never seen up until this point, chances are it's a rare
+         one and we should definitely inject a fault here (since we
+         might not get the chance again later).
+
+         This uses a probabilistic set structure (similar to a bloom
+         filter) to determine whether we have seen a particular
+         callchain before by hashing the stack trace and atomically
+         testing/setting a bit corresponding to the current callchain.
+
+         There is a possibility of false positives in the seen-set
+         (i.e. a hash collision makes us think we have seen a callchain
+         before when in fact we haven't, therefore we don't inject a
+         fault where we should have). We might use some sort of random
+         seed here, but the additional complexity doesn't seem worth it.
+
+         This finds a lot more bugs than just plain fault injection,
+         but comes with a small additional overhead.
+
 config LATENCYTOP
        bool "Latency measuring infrastructure"
        depends on DEBUG_KERNEL
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index adba7c9..5ad11dd 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -63,7 +63,7 @@ static bool fail_task(struct fault_attr *attr, struct task_struct *task)
 
 #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
 
-static bool fail_stacktrace(struct fault_attr *attr)
+static bool fail_stacktrace(struct fault_attr *attr, unsigned int *hash)
 {
        struct stack_trace trace;
        int depth = attr->stacktrace_depth;
@@ -88,12 +88,20 @@ static bool fail_stacktrace(struct fault_attr *attr)
                               entries[n] < attr->require_end)
                        found = true;
        }
+
+       if (IS_ENABLED(CONFIG_FAULT_INJECTION_AT_NEW_CALLSITES)) {
+               const char *start = (const char *) &entries[0];
+               const char *end = (const char *) &entries[trace.nr_entries];
+
+               *hash = full_name_hash(0, start, end - start);
+       }
+
        return found;
 }
 
 #else
 
-static inline bool fail_stacktrace(struct fault_attr *attr)
+static inline bool fail_stacktrace(struct fault_attr *attr, unsigned int *hash)
 {
        return true;
 }
@@ -134,6 +142,8 @@ out:
 
 bool should_fail(struct fault_attr *attr, ssize_t size)
 {
+       unsigned int hash = 0;
+
        /* No need to check any other properties if the probability is 0 */
        if (attr->probability == 0)
                return false;
@@ -149,6 +159,24 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
                return false;
        }
 
+       if (!fail_stacktrace(attr, &hash))
+               return false;
+
+       if (IS_ENABLED(CONFIG_FAULT_INJECTION_AT_NEW_CALLSITES)) {
+               static unsigned long seen_hashtable[4 * 1024];
+
+               hash &= 8 * sizeof(seen_hashtable) - 1;
+               if (!test_and_set_bit(hash & (BITS_PER_LONG - 1),
+                       &seen_hashtable[hash / BITS_PER_LONG]))
+               {
+                       /*
+                        * If it's the first time we see this stacktrace, fail it
+                        * without a second thought.
+                        */
+                       goto fail;
+               }
+       }
+
        if (attr->interval > 1) {
                attr->count++;
                if (attr->count % attr->interval)
@@ -158,9 +186,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
        if (attr->probability <= prandom_u32() % 100)
                return false;
 
-       if (!fail_stacktrace(attr))
-               return false;
-
+fail:
        return __fail(attr);
 }
 EXPORT_SYMBOL_GPL(should_fail);
-- 
1.9.1

Reply via email to