Dear all,
I've fixed one more small thing: until now, the many files from individual
sessions weren't being merged successfully, simply because I had specified
the wrong storage location.
Fortunately, your data isn't lost; it just hadn't been merged yet. The fixed
script takes care of that (see attachment): just put it in the same place and
start it the same way as before. If you want to sanity-check the merged
output afterwards, there's a little sketch below.
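A minimal sketch for inspecting the merged data (assuming the default
--output-dir, so the combined file lives at neo_ngram_durations/bigrams_all.csv):

    import csv
    import json

    with open("neo_ngram_durations/bigrams_all.csv", newline="") as f:
        reader = csv.reader(f)
        next(reader)  # skip the "key,durations" header
        for key, durations in reader:
            print(key, json.loads(durations))  # durations are shuffled ms values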
The changes can be tracked here; I hope there won't be many more needed :)
https://github.com/Glitchy-Tozier/neo_ngram_duration_logger/commit/e0d0a917815e4a259ac2774703d249ddd0cf4d15
Best regards,
Florian
#!/usr/bin/env python3
import importlib
import sys
# ---------- 1. Check for missing packages ----------
required_packages = ["pynput"]
missing_packages = []
for pkg in required_packages:
try:
importlib.import_module(pkg)
except ImportError:
missing_packages.append(pkg)
if missing_packages:
print("The following packages are missing:", ", ".join(missing_packages))
print("You can install them all with this command:")
print(f"pip install {' '.join(missing_packages)}")
sys.exit(1)
# ---------- 2. Standard imports ----------
import argparse
import csv
import os
from datetime import datetime
from pynput import keyboard
import time
import threading
import random  # for shuffling durations and rows (privacy)
import glob
import json
import ast
# ---------- 3. Command-line argument ----------
parser = argparse.ArgumentParser(description="Keylogger for bigram and trigram durations")
parser.add_argument("--output-dir", default="./neo_ngram_durations", help="Directory to store log files")
args = parser.parse_args()
output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)
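# Example invocation (the script filename here is hypothetical):
#   python3 neo_ngram_duration_logger.py --output-dir ./neo_ngram_durations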
# ---------- 4. File setup ----------
individual_runs = os.path.join(output_dir, "individual_runs")
os.makedirs(individual_runs, exist_ok=True)
timestamp = datetime.now().strftime("%y%m%d_%H%M%S")
bigram_file = os.path.join(individual_runs, f"bigrams_{timestamp}.csv")
trigram_file = os.path.join(individual_runs, f"trigrams_{timestamp}.csv")
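# Per-run CSV layout: one row per n-gram, with all observed durations (ms)
# stored as a JSON list, e.g.  "th","[112.5, 98.0, 130.2]"  (values illustrative)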
bigram_durations = {} # {bigram: [durations]}
trigram_durations = {} # {trigram: [durations]}
key_buffer = [] # store recent keys
time_buffer = [] # store recent press times
last_time = None
last_flush = time.time()
# Create the lock before any thread uses it.
lock = threading.Lock()
stop_event = threading.Event()  # signals the flush thread to shut down cleanly
# ---------- 5. CSV Initialization ----------
def init_csv_files():
for file_path, header in [(bigram_file, ["bigram", "durations"]),
(trigram_file, ["trigram", "durations"])]:
if not os.path.exists(file_path):
with open(file_path, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(header)
# ---------- 6. Flush function with row shuffling ----------
def flush_to_csv():
global last_flush
with lock:
# ---------- Bigram CSV ----------
rows = []
        for bigram, durations in bigram_durations.items():
            # Shuffle a copy of the durations so the original typing order
            # cannot be reconstructed from the file
            shuffled_durations = durations[:]
            random.shuffle(shuffled_durations)
            # Store as JSON for safe round-tripping
            rows.append([bigram, json.dumps(shuffled_durations)])
        # Shuffle the rows themselves to further hinder reconstructing typing sequences
        random.shuffle(rows)
tmp_path = bigram_file + ".tmp"
with open(tmp_path, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(["bigram", "durations"])
writer.writerows(rows)
os.replace(tmp_path, bigram_file) # atomic replace
# ---------- Trigram CSV ----------
rows = []
        for trigram, durations in trigram_durations.items():
            # Shuffle a copy of the durations to obscure typing order
            shuffled_durations = durations[:]
            random.shuffle(shuffled_durations)
            # Store as JSON for safe round-tripping
            rows.append([trigram, json.dumps(shuffled_durations)])
        # Shuffle the rows for additional privacy
        random.shuffle(rows)
tmp_path = trigram_file + ".tmp"
with open(tmp_path, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(["trigram", "durations"])
writer.writerows(rows)
os.replace(tmp_path, trigram_file) # atomic replace
last_flush = time.time()
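
# Note: each flush rewrites this run's CSV files in full from the in-memory
# dicts, so the files on disk always reflect the entire session so far.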
# ---------- 7. Periodic flush ----------
def periodic_flush():
    while not stop_event.is_set():  # run until shutdown is signalled
if time.time() - last_flush >= 10: # flush every 10 seconds
flush_to_csv()
time.sleep(1)
flush_thread = threading.Thread(target=periodic_flush, daemon=True)
flush_thread.start()
# ---------- 8. Helper: key to string ----------
def key_to_str(key):
"""Convert key to readable string, keeping Shift/Ctrl/Alt separate.
Characters are converted to lowercase to reflect the key pressed, not resulting char."""
if hasattr(key, 'char') and key.char is not None:
return key.char.lower() # normalize char to lowercase
else:
return f"<{key.name}>" if hasattr(key, 'name') else str(key)
# ---------- 9. Key press handler ----------
def on_press(key):
global last_time, key_buffer, time_buffer
current_time = time.time() * 1000 # milliseconds
key_str = key_to_str(key)
if last_time is not None and key_buffer:
interval = current_time - last_time
prev_key = key_buffer[-1]
print(f"Duration: {prev_key} → {key_str} = {interval:.2f} ms")
with lock:
# Bigram
bigram = prev_key + key_str
bigram_durations.setdefault(bigram, []).append(interval)
# Trigram (timestamp-based)
if len(key_buffer) >= 2:
trigram = key_buffer[-2] + prev_key + key_str
                trigram_duration = current_time - time_buffer[-2]  # spans from the trigram's first key press to the current one
trigram_durations.setdefault(trigram, []).append(trigram_duration)
if (key_buffer[-2] in ("<ctrl>", "<ctrl_l>", "<ctrl_r>") and
prev_key in ("<shift>", "<shift_l>", "<shift_r>") and
key_str == "<esc>"):
print("\nDetected CTRL → SHIFT → ESC trigram. Exiting the script.\n")
return False
# Update buffers
key_buffer.append(key_str)
    time_buffer.append(current_time)  # store this press's timestamp
if len(key_buffer) > 2:
key_buffer.pop(0)
time_buffer.pop(0)
last_time = current_time
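
# Buffer example (illustrative): after typing "a", "b", "c" the buffers hold the
# last two keys ["b", "c"] and their two press timestamps.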
# ---------- 10. Listener ----------
init_csv_files()
print("\nLogging started. Write a CTRL → SHIFT → ESC trigram to stop.\n")
try:
    with keyboard.Listener(on_press=on_press) as listener:
listener.join()
except KeyboardInterrupt:
pass # graceful stop
# Tell flush thread to stop and wait for it
stop_event.set()
flush_thread.join()
# ---------- 11. Merge all past runs into one combined file ----------
def merge_all_files(individual_runs_dir, pattern, output_dir, combined_filename):
"""
Merge all CSV files matching the pattern into a single combined CSV.
Each bigram/trigram gets one row with durations merged from all runs.
Durations and rows are shuffled to improve privacy and make sharing easier.
"""
combined_data = {}
    with lock:  # hold the lock so a late flush cannot race with the merge
# Collect all files matching the pattern (e.g., bigrams_*.csv)
for file_path in glob.glob(os.path.join(individual_runs_dir, pattern)):
if file_path.endswith(combined_filename): # skip the final combined file itself
continue
            with open(file_path, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
                next(reader, None)  # skip the header row
for row in reader:
if len(row) != 2:
continue
key, durations_str = row
                    try:
                        # Prefer JSON; fall back to literal_eval for rows written
                        # by older runs that stored Python list reprs
                        try:
                            durations = json.loads(durations_str)
                        except json.JSONDecodeError:
                            durations = ast.literal_eval(durations_str)
                        if not isinstance(durations, list):
                            durations = [durations]
                    except Exception:
                        continue
combined_data.setdefault(key, []).extend(durations)
# Shuffle durations per key
rows = []
for key, durations in combined_data.items():
shuffled_durations = durations[:]
random.shuffle(shuffled_durations)
rows.append([key, json.dumps(shuffled_durations)])
# Shuffle rows to further hide sequence info
random.shuffle(rows)
# Write final combined file
combined_path = os.path.join(output_dir, combined_filename)
tmp_path = combined_path + ".tmp"
    with open(tmp_path, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(["key", "durations"])
writer.writerows(rows)
os.replace(tmp_path, combined_path)
print(f"Combined file saved: {combined_path}")
# ---------- 12. Final save and merge ----------
flush_to_csv()
# Merge all bigram files into one big final file
merge_all_files(individual_runs, "bigrams_*.csv", output_dir, "bigrams_all.csv")
# Merge all trigram files into one big final file
merge_all_files(individual_runs, "trigrams_*.csv", output_dir, "trigrams_all.csv")
print("\nThank you for contributing your duration-data.")