Recently, I was trying to debug a deadlock on lnet_net_lock(), which is a CPT-specific (per-CPU-partition) lock. It gets tricky to look at a crash dump and find the culprit that grabbed the lock. Thanks to Olaf Weber for a neat trick that records which thread grabbed the lock and where in the code that happened. The patch is below. Basically, whenever I grab the lock I store the pointer to the current thread and the return address, i.e. the point in the code where execution resumes after the call that grabs the lock. When I unlock, I clear this information. When there is a deadlock, we grab the dump and look at these fields for each partition's lock. The entries whose fields are non-NULL identify the thread still holding the lock and the call site where it was taken. Very neat indeed.
[ashehata@dev master]$ git diff libcfs/include/libcfs/libcfs_cpu.h diff --git a/libcfs/include/libcfs/libcfs_cpu.h b/libcfs/include/libcfs/libcfs_cpu.h index fb268a8..b0c2ee5 100644 --- a/libcfs/include/libcfs/libcfs_cpu.h +++ b/libcfs/include/libcfs/libcfs_cpu.h @@ -266,6 +266,11 @@ enum { CFS_PERCPT_LOCK_EX = -1, /* negative */ }; +struct cfs_lock_info { + void *pcl_fn_ptr; + void *pcl_thr_ptr; +}; + struct cfs_percpt_lock { /* cpu-partition-table for this lock */ struct cfs_cpt_table *pcl_cptab; @@ -273,6 +278,8 @@ struct cfs_percpt_lock { unsigned int pcl_locked; /* private lock table */ spinlock_t **pcl_locks; + + struct cfs_lock_info **pcl_locks_info; }; /* return number of private locks */ [ashehata@dev master]$ git diff libcfs/libcfs/libcfs_lock.c diff --git a/libcfs/libcfs/libcfs_lock.c b/libcfs/libcfs/libcfs_lock.c index c6ba9e7..876e8f7 100644 --- a/libcfs/libcfs/libcfs_lock.c +++ b/libcfs/libcfs/libcfs_lock.c @@ -42,6 +42,7 @@ cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) LASSERT(!pcl->pcl_locked); cfs_percpt_free(pcl->pcl_locks); + cfs_percpt_free(pcl->pcl_locks_info); LIBCFS_FREE(pcl, sizeof(*pcl)); } EXPORT_SYMBOL(cfs_percpt_lock_free); @@ -73,6 +74,13 @@ cfs_percpt_lock_create(struct cfs_cpt_table *cptab, return NULL; } + pcl->pcl_locks_info = cfs_percpt_alloc(cptab, sizeof(struct cfs_lock_info)); + if (pcl->pcl_locks_info == NULL) { + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + if (keys == NULL) { CWARN("Cannot setup class key for percpt lock, you may see " "recursive locking warnings which are actually fake.\n"); @@ -115,12 +123,16 @@ __acquires(pcl->pcl_locks) if (likely(index != CFS_PERCPT_LOCK_EX)) { spin_lock(pcl->pcl_locks[index]); + pcl->pcl_locks_info[index]->pcl_fn_ptr = __builtin_return_address(0); + pcl->pcl_locks_info[index]->pcl_thr_ptr = current; return; } /* exclusive lock request */ for (i = 0; i < ncpt; i++) { spin_lock(pcl->pcl_locks[i]); + pcl->pcl_locks_info[i]->pcl_fn_ptr = 
__builtin_return_address(0); + pcl->pcl_locks_info[i]->pcl_thr_ptr = current; if (i == 0) { LASSERT(!pcl->pcl_locked); /* nobody should take private lock after this @@ -142,6 +154,8 @@ __releases(pcl->pcl_locks) index = ncpt == 1 ? 0 : index; if (likely(index != CFS_PERCPT_LOCK_EX)) { + pcl->pcl_locks_info[index]->pcl_fn_ptr = NULL; + pcl->pcl_locks_info[index]->pcl_thr_ptr = NULL; spin_unlock(pcl->pcl_locks[index]); return; } @@ -151,6 +165,8 @@ __releases(pcl->pcl_locks) LASSERT(pcl->pcl_locked); pcl->pcl_locked = 0; } + pcl->pcl_locks_info[i]->pcl_fn_ptr = NULL; + pcl->pcl_locks_info[i]->pcl_thr_ptr = NULL; spin_unlock(pcl->pcl_locks[i]); } } |