Recently, I was trying to debug a deadlock that occurred while holding lnet_net_lock(), which is a CPT-specific (CPU partition) lock. It gets tricky to look at the dump and find the culprit that grabbed the lock. Thanks to Olaf Weber for a neat trick that records which thread grabbed the lock and where in the code that happened. The patch is below. Basically, whenever I grab the lock, I store the pointer to the current thread and the return address, i.e. the point in the program that execution will return to after the call that grabs the lock. When I unlock, I clear this information. When there is a deadlock, we grab the dump and look at these fields: the entries whose fields are non-NULL provide the information needed. Very neat indeed.
[ashehata@dev master]$ git diff libcfs/include/libcfs/libcfs_cpu.h
diff --git a/libcfs/include/libcfs/libcfs_cpu.h b/libcfs/include/libcfs/libcfs_cpu.h
index fb268a8..b0c2ee5 100644
--- a/libcfs/include/libcfs/libcfs_cpu.h
+++ b/libcfs/include/libcfs/libcfs_cpu.h
@@ -266,6 +266,11 @@ enum {
CFS_PERCPT_LOCK_EX = -1, /* negative */
};
+struct cfs_lock_info {
+ void *pcl_fn_ptr;
+ void *pcl_thr_ptr;
+};
+
struct cfs_percpt_lock {
/* cpu-partition-table for this lock */
struct cfs_cpt_table *pcl_cptab;
@@ -273,6 +278,8 @@ struct cfs_percpt_lock {
unsigned int pcl_locked;
/* private lock table */
spinlock_t **pcl_locks;
+
+ struct cfs_lock_info **pcl_locks_info;
};
/* return number of private locks */
[ashehata@dev master]$ git diff libcfs/libcfs/libcfs_lock.c
diff --git a/libcfs/libcfs/libcfs_lock.c b/libcfs/libcfs/libcfs_lock.c
index c6ba9e7..876e8f7 100644
--- a/libcfs/libcfs/libcfs_lock.c
+++ b/libcfs/libcfs/libcfs_lock.c
@@ -42,6 +42,7 @@ cfs_percpt_lock_free(struct cfs_percpt_lock *pcl)
LASSERT(!pcl->pcl_locked);
cfs_percpt_free(pcl->pcl_locks);
+ cfs_percpt_free(pcl->pcl_locks_info);
LIBCFS_FREE(pcl, sizeof(*pcl));
}
EXPORT_SYMBOL(cfs_percpt_lock_free);
@@ -73,6 +74,13 @@ cfs_percpt_lock_create(struct cfs_cpt_table *cptab,
return NULL;
}
+ pcl->pcl_locks_info = cfs_percpt_alloc(cptab, sizeof(struct cfs_lock_info));
+ if (pcl->pcl_locks_info == NULL) {
+ cfs_percpt_free(pcl->pcl_locks);
+ LIBCFS_FREE(pcl, sizeof(*pcl));
+ return NULL;
+ }
+
if (keys == NULL) {
CWARN("Cannot setup class key for percpt lock, you may see "
"recursive locking warnings which are actually fake.\n");
@@ -115,12 +123,16 @@ __acquires(pcl->pcl_locks)
if (likely(index != CFS_PERCPT_LOCK_EX)) {
spin_lock(pcl->pcl_locks[index]);
+ pcl->pcl_locks_info[index]->pcl_fn_ptr = __builtin_return_address(0);
+ pcl->pcl_locks_info[index]->pcl_thr_ptr = current;
return;
}
/* exclusive lock request */
for (i = 0; i < ncpt; i++) {
spin_lock(pcl->pcl_locks[i]);
+ pcl->pcl_locks_info[i]->pcl_fn_ptr = __builtin_return_address(0);
+ pcl->pcl_locks_info[i]->pcl_thr_ptr = current;
if (i == 0) {
LASSERT(!pcl->pcl_locked);
/* nobody should take private lock after this
@@ -142,6 +154,8 @@ __releases(pcl->pcl_locks)
index = ncpt == 1 ? 0 : index;
if (likely(index != CFS_PERCPT_LOCK_EX)) {
+ pcl->pcl_locks_info[index]->pcl_fn_ptr = NULL;
+ pcl->pcl_locks_info[index]->pcl_thr_ptr = NULL;
spin_unlock(pcl->pcl_locks[index]);
return;
}
@@ -151,6 +165,8 @@ __releases(pcl->pcl_locks)
LASSERT(pcl->pcl_locked);
pcl->pcl_locked = 0;
}
+ pcl->pcl_locks_info[i]->pcl_fn_ptr = NULL;
+ pcl->pcl_locks_info[i]->pcl_thr_ptr = NULL;
spin_unlock(pcl->pcl_locks[i]);
}
}