diff --git a/Modules/_testinternalcapi/clinic/test_lock.c.h b/Modules/_testinternalcapi/clinic/test_lock.c.h
index 86875767343cd2..234eca2b8d6a67 100644
--- a/Modules/_testinternalcapi/clinic/test_lock.c.h
+++ b/Modules/_testinternalcapi/clinic/test_lock.c.h
@@ -2,35 +2,74 @@
 preserve
 [clinic start generated code]*/
 
+#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+#  include "pycore_gc.h"          // PyGC_Head
+#  include "pycore_runtime.h"     // _Py_ID()
+#endif
 #include "pycore_abstract.h"      // _PyNumber_Index()
-#include "pycore_modsupport.h"    // _PyArg_CheckPositional()
+#include "pycore_modsupport.h"    // _PyArg_UnpackKeywords()
 
 PyDoc_STRVAR(_testinternalcapi_benchmark_locks__doc__,
-"benchmark_locks($module, num_threads, use_pymutex=True,\n"
-"                critical_section_length=1, time_ms=1000, /)\n"
+"benchmark_locks($module, num_threads, /, *, num_locks=1,\n"
+"                critical_section_length=1, work_outside_length=0,\n"
+"                time_ms=1000, iters_limit=0)\n"
 "--\n"
 "\n");
 
 #define _TESTINTERNALCAPI_BENCHMARK_LOCKS_METHODDEF    \
-    {"benchmark_locks", _PyCFunction_CAST(_testinternalcapi_benchmark_locks), METH_FASTCALL, _testinternalcapi_benchmark_locks__doc__},
+    {"benchmark_locks", _PyCFunction_CAST(_testinternalcapi_benchmark_locks), METH_FASTCALL|METH_KEYWORDS, _testinternalcapi_benchmark_locks__doc__},
 
 static PyObject *
 _testinternalcapi_benchmark_locks_impl(PyObject *module,
                                        Py_ssize_t num_threads,
-                                       int use_pymutex,
+                                       Py_ssize_t num_locks,
                                        int critical_section_length,
-                                       int time_ms);
+                                       int work_outside_length, int time_ms,
+                                       Py_ssize_t iters_limit);
 
 static PyObject *
-_testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
+_testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
 {
     PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 5
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(num_locks), &_Py_ID(critical_section_length), &_Py_ID(work_outside_length), &_Py_ID(time_ms), &_Py_ID(iters_limit), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"", "num_locks", "critical_section_length", "work_outside_length", "time_ms", "iters_limit", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "benchmark_locks",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[6];
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
     Py_ssize_t num_threads;
-    int use_pymutex = 1;
+    Py_ssize_t num_locks = 1;
     int critical_section_length = 1;
+    int work_outside_length = 0;
     int time_ms = 1000;
+    Py_ssize_t iters_limit = 0;
 
-    if (!_PyArg_CheckPositional("benchmark_locks", nargs, 1, 4)) {
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
         goto exit;
     }
     {
@@ -45,31 +84,69 @@ _testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ss
         }
         num_threads = ival;
     }
-    if (nargs < 2) {
-        goto skip_optional;
+    if (!noptargs) {
+        goto skip_optional_kwonly;
     }
-    use_pymutex = PyObject_IsTrue(args[1]);
-    if (use_pymutex < 0) {
-        goto exit;
+    if (args[1]) {
+        {
+            Py_ssize_t ival = -1;
+            PyObject *iobj = _PyNumber_Index(args[1]);
+            if (iobj != NULL) {
+                ival = PyLong_AsSsize_t(iobj);
+                Py_DECREF(iobj);
+            }
+            if (ival == -1 && PyErr_Occurred()) {
+                goto exit;
+            }
+            num_locks = ival;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
     }
-    if (nargs < 3) {
-        goto skip_optional;
+    if (args[2]) {
+        critical_section_length = PyLong_AsInt(args[2]);
+        if (critical_section_length == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
     }
-    critical_section_length = PyLong_AsInt(args[2]);
-    if (critical_section_length == -1 && PyErr_Occurred()) {
-        goto exit;
+    if (args[3]) {
+        work_outside_length = PyLong_AsInt(args[3]);
+        if (work_outside_length == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
     }
-    if (nargs < 4) {
-        goto skip_optional;
+    if (args[4]) {
+        time_ms = PyLong_AsInt(args[4]);
+        if (time_ms == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
     }
-    time_ms = PyLong_AsInt(args[3]);
-    if (time_ms == -1 && PyErr_Occurred()) {
-        goto exit;
+    {
+        Py_ssize_t ival = -1;
+        PyObject *iobj = _PyNumber_Index(args[5]);
+        if (iobj != NULL) {
+            ival = PyLong_AsSsize_t(iobj);
+            Py_DECREF(iobj);
+        }
+        if (ival == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        iters_limit = ival;
     }
-skip_optional:
-    return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, use_pymutex, critical_section_length, time_ms);
+skip_optional_kwonly:
+    return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, num_locks, critical_section_length, work_outside_length, time_ms, iters_limit);
 
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=105105d759c0c271 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=c53bf7118fb334cc input=a9049054013a1b77]*/
diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c
index ded76ca9fe6819..14f502c810bf74 100644
--- a/Modules/_testinternalcapi/test_lock.c
+++ b/Modules/_testinternalcapi/test_lock.c
@@ -2,7 +2,7 @@
 
 #include "parts.h"
 #include "pycore_lock.h"
-#include "pycore_pythread.h"      // PyThread_get_thread_ident_ex()
+#include "pycore_pythread.h" // PyThread_get_thread_ident_ex()
 
 #include "clinic/test_lock.c.h"
 
@@ -10,7 +10,7 @@
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #else
-#include <unistd.h>         // usleep()
+#include <unistd.h> // usleep()
 #endif
 
 /*[clinic input]
@@ -18,490 +18,476 @@ module _testinternalcapi
 [clinic start generated code]*/
 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=7bb583d8c9eb9a78]*/
 
-
-static void
-pysleep(int ms)
-{
+static void pysleep(int ms) {
 #ifdef MS_WINDOWS
-    Sleep(ms);
+  Sleep(ms);
 #else
-    usleep(ms * 1000);
+  usleep(ms * 1000);
 #endif
 }
 
-static PyObject *
-test_lock_basic(PyObject *self, PyObject *obj)
-{
-    PyMutex m = (PyMutex){0};
+static PyObject *test_lock_basic(PyObject *self, PyObject *obj) {
+  PyMutex m = (PyMutex){0};
 
-    // uncontended lock and unlock
-    PyMutex_Lock(&m);
-    assert(m._bits == 1);
-    PyMutex_Unlock(&m);
-    assert(m._bits == 0);
+  // uncontended lock and unlock
+  PyMutex_Lock(&m);
+  assert(m._bits == 1);
+  PyMutex_Unlock(&m);
+  assert(m._bits == 0);
 
-    Py_RETURN_NONE;
+  Py_RETURN_NONE;
 }
 
 struct test_lock2_data {
-    PyMutex m;
-    PyEvent done;
-    int started;
+  PyMutex m;
+  PyEvent done;
+  int started;
 };
 
-static void
-lock_thread(void *arg)
-{
-    struct test_lock2_data *test_data = arg;
-    PyMutex *m = &test_data->m;
-    _Py_atomic_store_int(&test_data->started, 1);
+static void lock_thread(void *arg) {
+  struct test_lock2_data *test_data = arg;
+  PyMutex *m = &test_data->m;
+  _Py_atomic_store_int(&test_data->started, 1);
 
-    PyMutex_Lock(m);
-    // gh-135641: in rare cases the lock may still have `_Py_HAS_PARKED` set
-    // (m->_bits == 3) due to bucket collisions in the parking lot hash table
-    // between this mutex and the `test_data.done` event.
-    assert(m->_bits == 1 || m->_bits == 3);
+  PyMutex_Lock(m);
+  // gh-135641: in rare cases the lock may still have `_Py_HAS_PARKED` set
+  // (m->_bits == 3) due to bucket collisions in the parking lot hash table
+  // between this mutex and the `test_data.done` event.
+  assert(m->_bits == 1 || m->_bits == 3);
 
-    PyMutex_Unlock(m);
-    assert(m->_bits == 0);
+  PyMutex_Unlock(m);
+  assert(m->_bits == 0);
 
-    _PyEvent_Notify(&test_data->done);
+  _PyEvent_Notify(&test_data->done);
 }
 
-static PyObject *
-test_lock_two_threads(PyObject *self, PyObject *obj)
-{
-    // lock attempt by two threads
-    struct test_lock2_data test_data;
-    memset(&test_data, 0, sizeof(test_data));
-
-    PyMutex_Lock(&test_data.m);
-    assert(test_data.m._bits == 1);
-
-    PyThread_start_new_thread(lock_thread, &test_data);
-
-    // wait up to two seconds for the lock_thread to attempt to lock "m"
-    int iters = 0;
-    uint8_t v;
-    do {
-        pysleep(10);  // allow some time for the other thread to try to lock
-        v = _Py_atomic_load_uint8_relaxed(&test_data.m._bits);
-        assert(v == 1 || v == 3);
-        iters++;
-    } while (v != 3 && iters < 200);
-
-    // both the "locked" and the "has parked" bits should be set
+static PyObject *test_lock_two_threads(PyObject *self, PyObject *obj) {
+  // lock attempt by two threads
+  struct test_lock2_data test_data;
+  memset(&test_data, 0, sizeof(test_data));
+
+  PyMutex_Lock(&test_data.m);
+  assert(test_data.m._bits == 1);
+
+  PyThread_start_new_thread(lock_thread, &test_data);
+
+  // wait up to two seconds for the lock_thread to attempt to lock "m"
+  int iters = 0;
+  uint8_t v;
+  do {
+    pysleep(10); // allow some time for the other thread to try to lock
     v = _Py_atomic_load_uint8_relaxed(&test_data.m._bits);
-    assert(v == 3);
+    assert(v == 1 || v == 3);
+    iters++;
+  } while (v != 3 && iters < 200);
 
-    PyMutex_Unlock(&test_data.m);
-    PyEvent_Wait(&test_data.done);
-    assert(test_data.m._bits == 0);
+  // both the "locked" and the "has parked" bits should be set
+  v = _Py_atomic_load_uint8_relaxed(&test_data.m._bits);
+  assert(v == 3);
 
-    Py_RETURN_NONE;
+  PyMutex_Unlock(&test_data.m);
+  PyEvent_Wait(&test_data.done);
+  assert(test_data.m._bits == 0);
+
+  Py_RETURN_NONE;
 }
 
 #define COUNTER_THREADS 5
 #define COUNTER_ITERS 10000
 
 struct test_data_counter {
-    PyMutex m;
-    Py_ssize_t counter;
+  PyMutex m;
+  Py_ssize_t counter;
 };
 
 struct thread_data_counter {
-    struct test_data_counter *test_data;
-    PyEvent done_event;
+  struct test_data_counter *test_data;
+  PyEvent done_event;
 };
 
-static void
-counter_thread(void *arg)
-{
-    struct thread_data_counter *thread_data = arg;
-    struct test_data_counter *test_data = thread_data->test_data;
+static void counter_thread(void *arg) {
+  struct thread_data_counter *thread_data = arg;
+  struct test_data_counter *test_data = thread_data->test_data;
 
-    for (Py_ssize_t i = 0; i < COUNTER_ITERS; i++) {
-        PyMutex_Lock(&test_data->m);
-        test_data->counter++;
-        PyMutex_Unlock(&test_data->m);
-    }
-    _PyEvent_Notify(&thread_data->done_event);
+  for (Py_ssize_t i = 0; i < COUNTER_ITERS; i++) {
+    PyMutex_Lock(&test_data->m);
+    test_data->counter++;
+    PyMutex_Unlock(&test_data->m);
+  }
+  _PyEvent_Notify(&thread_data->done_event);
 }
 
-static PyObject *
-test_lock_counter(PyObject *self, PyObject *obj)
-{
-    // Test with rapidly locking and unlocking mutex
-    struct test_data_counter test_data;
-    memset(&test_data, 0, sizeof(test_data));
+static PyObject *test_lock_counter(PyObject *self, PyObject *obj) {
+  // Test with rapidly locking and unlocking mutex
+  struct test_data_counter test_data;
+  memset(&test_data, 0, sizeof(test_data));
 
-    struct thread_data_counter thread_data[COUNTER_THREADS];
-    memset(&thread_data, 0, sizeof(thread_data));
+  struct thread_data_counter thread_data[COUNTER_THREADS];
+  memset(&thread_data, 0, sizeof(thread_data));
 
-    for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
-        thread_data[i].test_data = &test_data;
-        PyThread_start_new_thread(counter_thread, &thread_data[i]);
-    }
+  for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
+    thread_data[i].test_data = &test_data;
+    PyThread_start_new_thread(counter_thread, &thread_data[i]);
+  }
 
-    for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
-        PyEvent_Wait(&thread_data[i].done_event);
-    }
+  for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
+    PyEvent_Wait(&thread_data[i].done_event);
+  }
 
-    assert(test_data.counter == COUNTER_THREADS * COUNTER_ITERS);
-    Py_RETURN_NONE;
+  assert(test_data.counter == COUNTER_THREADS * COUNTER_ITERS);
+  Py_RETURN_NONE;
 }
 
 #define SLOW_COUNTER_ITERS 100
 
-static void
-slow_counter_thread(void *arg)
-{
-    struct thread_data_counter *thread_data = arg;
-    struct test_data_counter *test_data = thread_data->test_data;
-
-    for (Py_ssize_t i = 0; i < SLOW_COUNTER_ITERS; i++) {
-        PyMutex_Lock(&test_data->m);
-        if (i % 7 == 0) {
-            pysleep(2);
-        }
-        test_data->counter++;
-        PyMutex_Unlock(&test_data->m);
+static void slow_counter_thread(void *arg) {
+  struct thread_data_counter *thread_data = arg;
+  struct test_data_counter *test_data = thread_data->test_data;
+
+  for (Py_ssize_t i = 0; i < SLOW_COUNTER_ITERS; i++) {
+    PyMutex_Lock(&test_data->m);
+    if (i % 7 == 0) {
+      pysleep(2);
     }
-    _PyEvent_Notify(&thread_data->done_event);
+    test_data->counter++;
+    PyMutex_Unlock(&test_data->m);
+  }
+  _PyEvent_Notify(&thread_data->done_event);
 }
 
-static PyObject *
-test_lock_counter_slow(PyObject *self, PyObject *obj)
-{
-    // Test lock/unlock with occasional "long" critical section, which will
-    // trigger handoff of the lock.
-    struct test_data_counter test_data;
-    memset(&test_data, 0, sizeof(test_data));
+static PyObject *test_lock_counter_slow(PyObject *self, PyObject *obj) {
+  // Test lock/unlock with occasional "long" critical section, which will
+  // trigger handoff of the lock.
+  struct test_data_counter test_data;
+  memset(&test_data, 0, sizeof(test_data));
 
-    struct thread_data_counter thread_data[COUNTER_THREADS];
-    memset(&thread_data, 0, sizeof(thread_data));
+  struct thread_data_counter thread_data[COUNTER_THREADS];
+  memset(&thread_data, 0, sizeof(thread_data));
 
-    for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
-        thread_data[i].test_data = &test_data;
-        PyThread_start_new_thread(slow_counter_thread, &thread_data[i]);
-    }
+  for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
+    thread_data[i].test_data = &test_data;
+    PyThread_start_new_thread(slow_counter_thread, &thread_data[i]);
+  }
 
-    for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
-        PyEvent_Wait(&thread_data[i].done_event);
-    }
+  for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) {
+    PyEvent_Wait(&thread_data[i].done_event);
+  }
 
-    assert(test_data.counter == COUNTER_THREADS * SLOW_COUNTER_ITERS);
-    Py_RETURN_NONE;
+  assert(test_data.counter == COUNTER_THREADS * SLOW_COUNTER_ITERS);
+  Py_RETURN_NONE;
 }
 
+struct lock_state { // one benchmarked mutex plus the value its critical section updates
+  PyMutex mutex;
+  double value;     // accessed by thread_benchmark_locks only while `mutex` is held
+  char padding[64]; // never read or written; presumably keeps neighbouring array entries on separate cache lines (TODO confirm)
+};
+
 struct bench_data_locks {
-    int stop;
-    int use_pymutex;
-    int critical_section_length;
-    char padding[200];
-    PyThread_type_lock lock;
-    PyMutex m;
-    double value;
-    Py_ssize_t total_iters;
+  int stop;                    // stored as 1 by the main thread; workers poll it atomically
+  int critical_section_length; // loop count executed while holding the lock
+  int work_outside_length;     // loop count executed before taking the lock
+  Py_ssize_t num_locks;        // number of entries in `locks`
+  Py_ssize_t iters_limit;      // if > 0, each worker stops after this many acquisitions
+  struct lock_state *locks;    // array of `num_locks` entries, allocated by the benchmark entry point
+  Py_ssize_t total_iters;      // sum of every worker's acquisition count (added atomically)
 };
 
 struct bench_thread_data {
-    struct bench_data_locks *bench_data;
-    Py_ssize_t iters;
-    PyEvent done;
+  struct bench_data_locks *bench_data; // parameters and state shared by all workers
+  Py_ssize_t iters;                    // acquisitions completed by this worker
+  PyEvent done;                        // notified by the worker just before it returns
+  Py_ssize_t index;                    // worker number; selects locks[index % num_locks]
 };
 
-static void
-thread_benchmark_locks(void *arg)
-{
-    struct bench_thread_data *thread_data = arg;
-    struct bench_data_locks *bench_data = thread_data->bench_data;
-    int use_pymutex = bench_data->use_pymutex;
-    int critical_section_length = bench_data->critical_section_length;
-
-    double my_value = 1.0;
-    Py_ssize_t iters = 0;
-    while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
-        if (use_pymutex) {
-            PyMutex_Lock(&bench_data->m);
-            for (int i = 0; i < critical_section_length; i++) {
-                bench_data->value += my_value;
-                my_value = bench_data->value;
-            }
-            PyMutex_Unlock(&bench_data->m);
-        }
-        else {
-            PyThread_acquire_lock(bench_data->lock, 1);
-            for (int i = 0; i < critical_section_length; i++) {
-                bench_data->value += my_value;
-                my_value = bench_data->value;
-            }
-            PyThread_release_lock(bench_data->lock);
-        }
-        iters++;
+static void thread_benchmark_locks(void *arg) { // worker entry point; arg is a struct bench_thread_data *
+  struct bench_thread_data *thread_data = arg;
+  struct bench_data_locks *bench_data = thread_data->bench_data;
+  int critical_section_length = bench_data->critical_section_length;
+  int work_outside_length = bench_data->work_outside_length;
+  Py_ssize_t iters_limit = bench_data->iters_limit;
+  Py_ssize_t num_locks = bench_data->num_locks;
+  struct lock_state *state = &bench_data->locks[thread_data->index % num_locks];
+  // Workers are spread round-robin over the locks; the modulus above needs num_locks >= 1.
+  double my_value = 1.0;
+  Py_ssize_t iters = 0;
+  while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
+    if (iters_limit > 0 && iters >= iters_limit) { // iters_limit <= 0 means no per-thread cap
+      break;
+    }
+    // Thread-local work performed without holding the lock.
+    for (int i = 0; i < work_outside_length; i++) {
+      my_value += 1.0;
+    }
+    // Critical section: every step reads back the shared value it just wrote.
+    PyMutex_Lock(&state->mutex);
+    for (int i = 0; i < critical_section_length; i++) {
+      state->value += my_value;
+      my_value = state->value;
     }
+    PyMutex_Unlock(&state->mutex);
+    iters++; // one completed lock acquisition
+  }
 
-    thread_data->iters = iters;
-    _Py_atomic_add_ssize(&bench_data->total_iters, iters);
-    _PyEvent_Notify(&thread_data->done);
+  thread_data->iters = iters;
+  _Py_atomic_add_ssize(&bench_data->total_iters, iters);
+  _PyEvent_Notify(&thread_data->done);
 }
 
 /*[clinic input]
 _testinternalcapi.benchmark_locks
 
     num_threads: Py_ssize_t
-    use_pymutex: bool = True
+    /
+    *
+    num_locks: Py_ssize_t = 1
     critical_section_length: int = 1
+    work_outside_length: int = 0
     time_ms: int = 1000
-    /
+    iters_limit: Py_ssize_t = 0
 
 [clinic start generated code]*/
 
 static PyObject *
 _testinternalcapi_benchmark_locks_impl(PyObject *module,
                                        Py_ssize_t num_threads,
-                                       int use_pymutex,
+                                       Py_ssize_t num_locks,
                                        int critical_section_length,
-                                       int time_ms)
-/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
+                                       int work_outside_length, int time_ms,
+                                       Py_ssize_t iters_limit)
+/*[clinic end generated code: output=1060df8700b70a72 input=6b1638d1cfa4f152]*/
 {
-    // Run from Tools/lockbench/lockbench.py
-    // Based on the WebKit lock benchmarks:
-    // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
-    // See also https://webkit.org/blog/6161/locking-in-webkit/
-    PyObject *thread_iters = NULL;
-    PyObject *res = NULL;
-
-    struct bench_data_locks bench_data;
-    memset(&bench_data, 0, sizeof(bench_data));
-    bench_data.use_pymutex = use_pymutex;
-    bench_data.critical_section_length = critical_section_length;
-
-    bench_data.lock = PyThread_allocate_lock();
-    if (bench_data.lock == NULL) {
-        return PyErr_NoMemory();
-    }
-
-    struct bench_thread_data *thread_data = NULL;
-    thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
-    if (thread_data == NULL) {
-        PyErr_NoMemory();
-        goto exit;
-    }
-
-    thread_iters = PyList_New(num_threads);
-    if (thread_iters == NULL) {
-        goto exit;
-    }
-
-    PyTime_t start, end;
-    if (PyTime_PerfCounter(&start) < 0) {
-        goto exit;
-    }
-
+  // Run from Tools/lockbench/lockbench.py
+  // Based on the WebKit lock benchmarks:
+  // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
+  // See also https://webkit.org/blog/6161/locking-in-webkit/
+  PyObject *thread_iters = NULL;
+  PyObject *res = NULL;
+  if (num_locks < 1) { // used as the lock array length and as a modulus in the workers
+    PyErr_SetString(PyExc_ValueError, "num_locks must be at least 1");
+    return NULL;
+  }
+  struct bench_data_locks bench_data = {
+      .critical_section_length = critical_section_length,
+      .work_outside_length = work_outside_length,
+      .num_locks = num_locks, .iters_limit = iters_limit};
+  bench_data.locks = PyMem_Calloc(num_locks, sizeof(struct lock_state));
+  if (bench_data.locks == NULL) {
+    return PyErr_NoMemory();
+  }
+
+  struct bench_thread_data *thread_data = NULL;
+  thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
+  if (thread_data == NULL) {
+    PyErr_NoMemory();
+    goto exit;
+  }
+
+  thread_iters = PyList_New(num_threads);
+  if (thread_iters == NULL) {
+    goto exit;
+  }
+
+  PyTime_t start, end;
+  if (PyTime_PerfCounter(&start) < 0) {
+    goto exit;
+  }
+
+  for (Py_ssize_t i = 0; i < num_threads; i++) {
+    thread_data[i].bench_data = &bench_data;
+    thread_data[i].index = i;
+    PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
+  }
+
+  if (iters_limit > 0) {
+    // Wait for all threads to finish their iterations
     for (Py_ssize_t i = 0; i < num_threads; i++) {
-        thread_data[i].bench_data = &bench_data;
-        PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
+      PyEvent_Wait(&thread_data[i].done);
     }
-
+    _Py_atomic_store_int(&bench_data.stop, 1);
+  } else {
     // Let the threads run for `time_ms` milliseconds
     pysleep(time_ms);
     _Py_atomic_store_int(&bench_data.stop, 1);
 
     // Wait for the threads to finish
     for (Py_ssize_t i = 0; i < num_threads; i++) {
-        PyEvent_Wait(&thread_data[i].done);
+      PyEvent_Wait(&thread_data[i].done);
     }
-
-    Py_ssize_t total_iters = bench_data.total_iters;
-    if (PyTime_PerfCounter(&end) < 0) {
-        goto exit;
-    }
-
-    // Return the total number of acquisitions and the number of acquisitions
-    // for each thread.
-    for (Py_ssize_t i = 0; i < num_threads; i++) {
-        PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
-        if (iter == NULL) {
-            goto exit;
-        }
-        PyList_SET_ITEM(thread_iters, i, iter);
+  }
+
+  Py_ssize_t total_iters = bench_data.total_iters;
+  if (PyTime_PerfCounter(&end) < 0) {
+    goto exit;
+  }
+
+  // Return the total number of acquisitions and the number of acquisitions
+  // for each thread.
+  for (Py_ssize_t i = 0; i < num_threads; i++) {
+    PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
+    if (iter == NULL) {
+      goto exit;
     }
+    PyList_SET_ITEM(thread_iters, i, iter);
+  }
 
-    assert(end != start);
-    double rate = total_iters * 1e9 / (end - start);
-    res = Py_BuildValue("(dO)", rate, thread_iters);
+  assert(end != start);
+  double rate = total_iters * 1e9 / (end - start);
+  res = Py_BuildValue("(dO)", rate, thread_iters);
 
 exit:
-    PyThread_free_lock(bench_data.lock);
-    PyMem_Free(thread_data);
-    Py_XDECREF(thread_iters);
-    return res;
+  PyMem_Free(bench_data.locks);
+  PyMem_Free(thread_data);
+  Py_XDECREF(thread_iters);
+  return res;
 }
 
-static PyObject *
-test_lock_benchmark(PyObject *module, PyObject *obj)
-{
-    // Just make sure the benchmark runs without crashing
-    PyObject *res = _testinternalcapi_benchmark_locks_impl(
-        module, 1, 1, 1, 100);
-    if (res == NULL) {
-        return NULL;
-    }
-    Py_DECREF(res);
-    Py_RETURN_NONE;
+static PyObject *test_lock_benchmark(PyObject *module, PyObject *obj) {
+  // Smoke test (1 thread, 1 lock, 100 ms timed run, no iteration limit): must not crash
+  PyObject *res =
+      _testinternalcapi_benchmark_locks_impl(module, 1, 1, 1, 0, 100, 0);
+  if (res == NULL) {
+    return NULL;
+  }
+  Py_DECREF(res);
+  Py_RETURN_NONE;
 }
 
-static int
-init_maybe_fail(void *arg)
-{
-    int *counter = (int *)arg;
-    (*counter)++;
-    if (*counter < 5) {
-        // failure
-        return -1;
-    }
-    assert(*counter == 5);
-    return 0;
+static int init_maybe_fail(void *arg) {
+  int *counter = (int *)arg;
+  (*counter)++;
+  if (*counter < 5) {
+    // failure
+    return -1;
+  }
+  assert(*counter == 5);
+  return 0;
 }
 
-static PyObject *
-test_lock_once(PyObject *self, PyObject *obj)
-{
-    _PyOnceFlag once = {0};
-    int counter = 0;
-    for (int i = 0; i < 10; i++) {
-        int res = _PyOnceFlag_CallOnce(&once, init_maybe_fail, &counter);
-        if (i < 4) {
-            assert(res == -1);
-        }
-        else {
-            assert(res == 0);
-            assert(counter == 5);
-        }
+static PyObject *test_lock_once(PyObject *self, PyObject *obj) {
+  _PyOnceFlag once = {0};
+  int counter = 0;
+  for (int i = 0; i < 10; i++) {
+    int res = _PyOnceFlag_CallOnce(&once, init_maybe_fail, &counter);
+    if (i < 4) {
+      assert(res == -1);
+    } else {
+      assert(res == 0);
+      assert(counter == 5);
     }
-    Py_RETURN_NONE;
+  }
+  Py_RETURN_NONE;
 }
 
 struct test_rwlock_data {
-    Py_ssize_t nthreads;
-    _PyRWMutex rw;
-    PyEvent step1;
-    PyEvent step2;
-    PyEvent step3;
-    PyEvent done;
+  Py_ssize_t nthreads;
+  _PyRWMutex rw;
+  PyEvent step1;
+  PyEvent step2;
+  PyEvent step3;
+  PyEvent done;
 };
 
-static void
-rdlock_thread(void *arg)
-{
-    struct test_rwlock_data *test_data = arg;
+static void rdlock_thread(void *arg) {
+  struct test_rwlock_data *test_data = arg;
 
-    // Acquire the lock in read mode
-    _PyRWMutex_RLock(&test_data->rw);
-    PyEvent_Wait(&test_data->step1);
-    _PyRWMutex_RUnlock(&test_data->rw);
+  // Acquire the lock in read mode
+  _PyRWMutex_RLock(&test_data->rw);
+  PyEvent_Wait(&test_data->step1);
+  _PyRWMutex_RUnlock(&test_data->rw);
 
-    _PyRWMutex_RLock(&test_data->rw);
-    PyEvent_Wait(&test_data->step3);
-    _PyRWMutex_RUnlock(&test_data->rw);
+  _PyRWMutex_RLock(&test_data->rw);
+  PyEvent_Wait(&test_data->step3);
+  _PyRWMutex_RUnlock(&test_data->rw);
 
-    if (_Py_atomic_add_ssize(&test_data->nthreads, -1) == 1) {
-        _PyEvent_Notify(&test_data->done);
-    }
+  if (_Py_atomic_add_ssize(&test_data->nthreads, -1) == 1) {
+    _PyEvent_Notify(&test_data->done);
+  }
 }
-static void
-wrlock_thread(void *arg)
-{
-    struct test_rwlock_data *test_data = arg;
+static void wrlock_thread(void *arg) {
+  struct test_rwlock_data *test_data = arg;
 
-    // First acquire the lock in write mode
-    _PyRWMutex_Lock(&test_data->rw);
-    PyEvent_Wait(&test_data->step2);
-    _PyRWMutex_Unlock(&test_data->rw);
+  // First acquire the lock in write mode
+  _PyRWMutex_Lock(&test_data->rw);
+  PyEvent_Wait(&test_data->step2);
+  _PyRWMutex_Unlock(&test_data->rw);
 
-    if (_Py_atomic_add_ssize(&test_data->nthreads, -1) == 1) {
-        _PyEvent_Notify(&test_data->done);
-    }
+  if (_Py_atomic_add_ssize(&test_data->nthreads, -1) == 1) {
+    _PyEvent_Notify(&test_data->done);
+  }
 }
 
-static void
-wait_until(uintptr_t *ptr, uintptr_t value)
-{
-    // wait up to two seconds for *ptr == value
-    int iters = 0;
-    uintptr_t bits;
-    do {
-        pysleep(10);
-        bits = _Py_atomic_load_uintptr(ptr);
-        iters++;
-    } while (bits != value && iters < 200);
+static void wait_until(uintptr_t *ptr, uintptr_t value) {
+  // wait up to two seconds for *ptr == value
+  int iters = 0;
+  uintptr_t bits;
+  do {
+    pysleep(10);
+    bits = _Py_atomic_load_uintptr(ptr);
+    iters++;
+  } while (bits != value && iters < 200);
 }
 
-static PyObject *
-test_lock_rwlock(PyObject *self, PyObject *obj)
-{
-    struct test_rwlock_data test_data = {.nthreads = 3};
+static PyObject *test_lock_rwlock(PyObject *self, PyObject *obj) {
+  struct test_rwlock_data test_data = {.nthreads = 3};
 
-    _PyRWMutex_Lock(&test_data.rw);
-    assert(test_data.rw.bits == 1);
+  _PyRWMutex_Lock(&test_data.rw);
+  assert(test_data.rw.bits == 1);
 
-    _PyRWMutex_Unlock(&test_data.rw);
-    assert(test_data.rw.bits == 0);
+  _PyRWMutex_Unlock(&test_data.rw);
+  assert(test_data.rw.bits == 0);
 
-    // Start two readers
-    PyThread_start_new_thread(rdlock_thread, &test_data);
-    PyThread_start_new_thread(rdlock_thread, &test_data);
+  // Start two readers
+  PyThread_start_new_thread(rdlock_thread, &test_data);
+  PyThread_start_new_thread(rdlock_thread, &test_data);
 
-    // wait up to two seconds for the threads to attempt to read-lock "rw"
-    wait_until(&test_data.rw.bits, 8);
-    assert(test_data.rw.bits == 8);
+  // wait up to two seconds for the threads to attempt to read-lock "rw"
+  wait_until(&test_data.rw.bits, 8);
+  assert(test_data.rw.bits == 8);
 
-    // start writer (while readers hold lock)
-    PyThread_start_new_thread(wrlock_thread, &test_data);
-    wait_until(&test_data.rw.bits, 10);
-    assert(test_data.rw.bits == 10);
+  // start writer (while readers hold lock)
+  PyThread_start_new_thread(wrlock_thread, &test_data);
+  wait_until(&test_data.rw.bits, 10);
+  assert(test_data.rw.bits == 10);
 
-    // readers release lock, writer should acquire it
-    _PyEvent_Notify(&test_data.step1);
-    wait_until(&test_data.rw.bits, 3);
-    assert(test_data.rw.bits == 3);
+  // readers release lock, writer should acquire it
+  _PyEvent_Notify(&test_data.step1);
+  wait_until(&test_data.rw.bits, 3);
+  assert(test_data.rw.bits == 3);
 
-    // writer releases lock, readers acquire it
-    _PyEvent_Notify(&test_data.step2);
-    wait_until(&test_data.rw.bits, 8);
-    assert(test_data.rw.bits == 8);
+  // writer releases lock, readers acquire it
+  _PyEvent_Notify(&test_data.step2);
+  wait_until(&test_data.rw.bits, 8);
+  assert(test_data.rw.bits == 8);
 
-    // readers release lock again
-    _PyEvent_Notify(&test_data.step3);
-    wait_until(&test_data.rw.bits, 0);
-    assert(test_data.rw.bits == 0);
+  // readers release lock again
+  _PyEvent_Notify(&test_data.step3);
+  wait_until(&test_data.rw.bits, 0);
+  assert(test_data.rw.bits == 0);
 
-    PyEvent_Wait(&test_data.done);
-    Py_RETURN_NONE;
+  PyEvent_Wait(&test_data.done);
+  Py_RETURN_NONE;
 }
 
-static PyObject *
-test_lock_recursive(PyObject *self, PyObject *obj)
-{
-    _PyRecursiveMutex m = (_PyRecursiveMutex){0};
-    assert(!_PyRecursiveMutex_IsLockedByCurrentThread(&m));
+static PyObject *test_lock_recursive(PyObject *self, PyObject *obj) {
+  _PyRecursiveMutex m = (_PyRecursiveMutex){0};
+  assert(!_PyRecursiveMutex_IsLockedByCurrentThread(&m));
 
-    _PyRecursiveMutex_Lock(&m);
-    assert(m.thread == PyThread_get_thread_ident_ex());
-    assert(PyMutex_IsLocked(&m.mutex));
-    assert(m.level == 0);
+  _PyRecursiveMutex_Lock(&m);
+  assert(m.thread == PyThread_get_thread_ident_ex());
+  assert(PyMutex_IsLocked(&m.mutex));
+  assert(m.level == 0);
 
-    _PyRecursiveMutex_Lock(&m);
-    assert(m.level == 1);
-    _PyRecursiveMutex_Unlock(&m);
+  _PyRecursiveMutex_Lock(&m);
+  assert(m.level == 1);
+  _PyRecursiveMutex_Unlock(&m);
 
-    _PyRecursiveMutex_Unlock(&m);
-    assert(m.thread == 0);
-    assert(!PyMutex_IsLocked(&m.mutex));
-    assert(m.level == 0);
+  _PyRecursiveMutex_Unlock(&m);
+  assert(m.thread == 0);
+  assert(!PyMutex_IsLocked(&m.mutex));
+  assert(m.level == 0);
 
-    Py_RETURN_NONE;
+  Py_RETURN_NONE;
 }
 
 static PyMethodDef test_methods[] = {
@@ -509,19 +495,18 @@ static PyMethodDef test_methods[] = {
     {"test_lock_two_threads", test_lock_two_threads, METH_NOARGS},
     {"test_lock_counter", test_lock_counter, METH_NOARGS},
     {"test_lock_counter_slow", test_lock_counter_slow, METH_NOARGS},
+    // This macro expands to a complete initializer, trailing comma included.
     _TESTINTERNALCAPI_BENCHMARK_LOCKS_METHODDEF
     {"test_lock_benchmark", test_lock_benchmark, METH_NOARGS},
     {"test_lock_once", test_lock_once, METH_NOARGS},
     {"test_lock_rwlock", test_lock_rwlock, METH_NOARGS},
     {"test_lock_recursive", test_lock_recursive, METH_NOARGS},
     {NULL, NULL} /* sentinel */
 };
 
-int
-_PyTestInternalCapi_Init_Lock(PyObject *mod)
-{
-    if (PyModule_AddFunctions(mod, test_methods) < 0) {
-        return -1;
-    }
-    return 0;
+// Adds the functions listed in test_methods to `mod`.
+// Returns 0 on success, or -1 when PyModule_AddFunctions() reports a failure.
+int _PyTestInternalCapi_Init_Lock(PyObject *mod) {
+  const int status = PyModule_AddFunctions(mod, test_methods);
+  return (status < 0) ? -1 : 0;
 }
diff --git a/Tools/lockbench/lockbench.py b/Tools/lockbench/lockbench.py
index 9833d703e00cbb..c7cd75b37b20cb 100644
--- a/Tools/lockbench/lockbench.py
+++ b/Tools/lockbench/lockbench.py
@@ -1,8 +1,6 @@
-# Measure the performance of PyMutex and PyThread_type_lock locks
+# Measure the performance of PyMutex
 # with short critical sections.
 #
-# Usage: python Tools/lockbench/lockbench.py [CRITICAL_SECTION_LENGTH]
-#
 # How to interpret the results:
 #
 # Acquisitions (kHz): Reports the total number of lock acquisitions in
@@ -18,36 +16,49 @@
 # lock.
 # See https://en.wikipedia.org/wiki/Fairness_measure#Jain's_fairness_index
 
+import argparse
 from _testinternalcapi import benchmark_locks
-import sys
-
-# Max number of threads to test
-MAX_THREADS = 10
-
-# How much "work" to do while holding the lock
-CRITICAL_SECTION_LENGTH = 1
-
 
 def jains_fairness(values):
     # Jain's fairness index
     # See https://en.wikipedia.org/wiki/Fairness_measure
+    if not any(values):  # empty, or every count is 0: the index would be 0/0
+        return 0.0
     return (sum(values) ** 2) / (len(values) * sum(x ** 2 for x in values))
 
 def main():
-    print("Lock Type           Threads           Acquisitions (kHz)   Fairness")
-    for lock_type in ["PyMutex", "PyThread_type_lock"]:
-        use_pymutex = (lock_type == "PyMutex")
-        for num_threads in range(1, MAX_THREADS + 1):
-            acquisitions, thread_iters = benchmark_locks(
-                num_threads, use_pymutex, CRITICAL_SECTION_LENGTH)
+    """Parse the command line, run benchmark_locks() once, and print a summary."""
+    parser = argparse.ArgumentParser(description="Measure the performance of PyMutex")
+    parser.add_argument("threads", type=int, nargs="?", default=1,
+                        help="Number of threads")
+    parser.add_argument("--num-locks", type=int, default=1,
+                        help="Number of locks")
+    parser.add_argument("--critical-section", type=int, default=1,
+                        help="Work inside the lock")
+    parser.add_argument("--work-outside", type=int, default=0,
+                        help="Work outside the lock")
+    parser.add_argument("--time", type=int, default=1000,
+                        help="Benchmark duration in milliseconds")
+    parser.add_argument("--total-iters", type=int, default=0,
+                        help="Fixed number of iterations per thread")
+
+    args = parser.parse_args()
 
-            acquisitions /= 1000  # report in kHz for readability
-            fairness = jains_fairness(thread_iters)
+    acquisitions, thread_iters = benchmark_locks(
+        args.threads,
+        num_locks=args.num_locks,
+        critical_section_length=args.critical_section,
+        work_outside_length=args.work_outside,
+        time_ms=args.time,
+        iters_limit=args.total_iters)
 
-            print(f"{lock_type: <20}{num_threads: <18}{acquisitions: >5.0f}{fairness: >20.2f}")
+    acquisitions /= 1000  # report in kHz for readability
+    fairness = jains_fairness(thread_iters)
 
+    print(f"Threads:            {args.threads}")
+    print(f"Locks:              {args.num_locks}")
+    print(f"Acquisitions (kHz): {acquisitions:.0f}")  # no column width: value follows its label
+    print(f"Fairness:           {fairness:.2f}")
 
 if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        CRITICAL_SECTION_LENGTH = int(sys.argv[1])
     main()