States:

  • LNET_PEER_STATE_INIT

  • LNET_PEER_STATE_VERIFY

  • LNET_PEER_STATE_ACTIVE

  • LNET_PEER_STATE_WAIT_PING_RESPONSE

  • LNET_PEER_STATE_PUSH_SENT

  • LNET_PEER_STATE_DISCOVERY

Events:

  • LNET_EVENT_RECV_ACK

  • LNET_EVENT_RECV_PING_PUSH

  • LNET_EVENT_RECV_PING_REPLY

  • LNET_EVENT_SEND

  • LNET_EVENT_DLC_ADD_PEER_NI

  • LNET_EVENT_DLC_DEL_PEER_NI
  • LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
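
As a rough illustration, the states and events above could be captured as two enums. The names are copied from the lists; the ordering, the *_MAX sentinels, and everything else about the representation is an assumption, and in real code the event names would likely need a distinct prefix (e.g. LNET_PEER_EVENT_*) to avoid clashing with the existing lnet_event_kind_t values such as LNET_EVENT_SEND and LNET_EVENT_ACK.

/* Sketch only: peer FSM states and events as enums. */
enum lnet_peer_state {
	LNET_PEER_STATE_INIT = 0,
	LNET_PEER_STATE_VERIFY,
	LNET_PEER_STATE_ACTIVE,
	LNET_PEER_STATE_WAIT_PING_RESPONSE,
	LNET_PEER_STATE_PUSH_SENT,
	LNET_PEER_STATE_DISCOVERY,
	LNET_PEER_STATE_MAX,	/* sentinel, for table sizing */
};

enum lnet_peer_event {
	LNET_EVENT_RECV_ACK = 0,
	LNET_EVENT_RECV_PING_PUSH,
	LNET_EVENT_RECV_PING_REPLY,
	LNET_EVENT_SEND,
	LNET_EVENT_DLC_ADD_PEER_NI,
	LNET_EVENT_DLC_DEL_PEER_NI,
	LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE,
	LNET_EVENT_MAX,		/* sentinel, for table sizing */
};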

Receiving

lnet_parse()

  • Perform error checking
  • If routing, update the status of the local network interface
  • If the message is not for the local node it should be routed, so perform the appropriate checks
  • Allocate an lnet_msg_t and populate it with the information passed in:
    •     msg->msg_type = type;
          msg->msg_private = private;
          msg->msg_receiving = 1;
          msg->msg_rdma_get = rdma_req;
          msg->msg_len = msg->msg_wanted = payload_length;
          msg->msg_offset = 0;
          msg->msg_hdr = *hdr;
          /* for building message event */
          msg->msg_from = from_nid;
          if (!for_me) {
              msg->msg_target.pid    = dest_pid;
              msg->msg_target.nid    = dest_nid;
              msg->msg_routing    = 1;
          } else {
              /* convert common msg->hdr fields to host byteorder */
              msg->msg_hdr.type    = type;
              msg->msg_hdr.src_nid    = src_nid;
              msg->msg_hdr.src_pid    = le32_to_cpu(msg->msg_hdr.src_pid);
              msg->msg_hdr.dest_nid    = dest_nid;
              msg->msg_hdr.dest_pid    = dest_pid;
              msg->msg_hdr.payload_length = payload_length;
          }
  • Now we need to call into the peer module. This should be the end of lnet_parse(). The following code will be moved to the peer module
    •     lnet_net_lock(cpt);
          rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
          if (rc != 0) {
              lnet_net_unlock(cpt);
              CERROR("%s, src %s: Dropping %s "
                     "(error %d looking up sender)\n",
                     libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
                     lnet_msgtyp2str(type), rc);
              lnet_msg_free(msg);
              goto drop;
          }
          if (lnet_isrouter(msg->msg_rxpeer)) {
              lnet_peer_set_alive(msg->msg_rxpeer);
              if (avoid_asym_router_failure &&
                  LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) {
                  /* received a remote message from router, update
                   * remote NI status on this router.
                   * NB: multi-hop routed message will be ignored.
                   */
                  lnet_router_ni_update_locked(msg->msg_rxpeer,
                                   LNET_NIDNET(src_nid));
              }
          }
          lnet_msg_commit(msg, cpt);
          /* message delay simulation */
          if (unlikely(!list_empty(&the_lnet.ln_delay_rules) &&
                   lnet_delay_rule_match_locked(hdr, msg))) {
              lnet_net_unlock(cpt);
              return 0;
          }
          if (!for_me) {
              rc = lnet_parse_forward_locked(ni, msg);
              lnet_net_unlock(cpt);
              if (rc < 0)
                  goto free_drop;
              if (rc == LNET_CREDIT_OK) {
                  lnet_ni_recv(ni, msg->msg_private, msg, 0,
                           0, payload_length, payload_length);
              }
              return 0;
          }
          lnet_net_unlock(cpt);
          /* AMIR:
           * lnet_peer_recv_message(ni, msg)
           */
          rc = lnet_parse_local(ni, msg);
          if (rc != 0)
              goto free_drop;
          return 0;
    • Instead call into the peer module with: lnet_peer_recv_data()

lnet_peer_recv_data()

  • lnet_net_lock()
  • find a peer_ni
  • Perform the bit of logic we moved out of lnet_parse()
  • If the peer_ni already exists, check whether it is detached or has a parent peer. If the peer_ni is not already in our DB, create one; by default it is detached.
    • if it's detached then check what kind of message it received
      • If it's a PING PUSH, then the peer is trying to tell us about the rest of its interfaces
        • We can check ahead of calling lnet_parse_put() by looking at hdr->msg.put.match_bits, which should tell us what specific message was sent to us
        • Create a buffer to put the push data in.
        • Create the peer/peer_net structures and link them
        • call lnet_parse_put()
          • provide an eq callback to put the ping info in.
        • Feed peer LNET_EVENT_RECV_PING_PUSH event
      • If it's any other type of message then perform similar operation to lnet_parse_local()
        • The peer is still detached at this point
        • int
          lnet_parse_local(lnet_ni_t *ni, lnet_msg_t *msg)
          {
              int    rc;
              switch (msg->msg_type) {
              case LNET_MSG_ACK:
                  rc = lnet_parse_ack(ni, msg);
                  break;
              case LNET_MSG_PUT:
                  rc = lnet_parse_put(ni, msg);
                  break;
              case LNET_MSG_GET:
                  rc = lnet_parse_get(ni, msg, msg->msg_rdma_get);
                  break;
              case LNET_MSG_REPLY:
                  rc = lnet_parse_reply(ni, msg);
                  break;
              default: /* prevent an unused label if !kernel */
                  LASSERT(0);
                  return -EPROTO;
              }
              LASSERT(rc == 0 || rc == ENOENT);
              return rc;
          }
    • If one already exists and it's part of a peer
      • if msg->msg_type == LNET_MSG_PUT then
        • if this is a PING_PUSH, same check as above.
          • Create a buffer to put the push data in
          • call lnet_parse_put()
          • Feed the peer LNET_EVENT_RECV_PING_PUSH event
      • if msg->msg_type == LNET_MSG_REPLY then
        • if this is a PING_REPLY
          • if peer_state == LNET_PEER_STATE_WAIT_PING_RESPONSE
            • The reason for the state check here is to avoid creating a buffer for the reply when we don't want to
              • An alternative is to have the buffer created when the ping is sent, but I like this better because it's symmetrical with dealing with receiving a PUSH.
            • Create a buffer to hold the reply in
            • call lnet_parse_reply()
              • Provide eq callback to copy ping info data in.
            • Feed the peer LNET_EVENT_RECV_PING_REPLY event
      • if msg->msg_type == LNET_MSG_ACK
        • call lnet_parse_ack()
          • No need for an eq callback here
        • Feed the peer LNET_EVENT_RECV_ACK event
      • else
        • perform logic similar to lnet_parse_local()
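
Putting the dispatch described above together, a minimal sketch of lnet_peer_recv_data() might look like the following. The helpers lnet_peer_ni_find_or_create_locked(), lnet_msg_is_ping_push(), lnet_msg_is_ping_reply() and lnet_peer_fsm_event() are placeholder names used only for illustration, and the detached-peer and state-dependent buffer handling is elided.

/* Sketch only; helper names are placeholders, error paths elided. */
static int
lnet_peer_recv_data(lnet_ni_t *ni, lnet_msg_t *msg, int cpt)
{
	struct lnet_peer_ni *lpni;
	struct lnet_peer *peer;
	int rc;

	lnet_net_lock(cpt);
	/* Find the peer_ni; create a detached one if it is unknown. */
	lpni = lnet_peer_ni_find_or_create_locked(msg->msg_from, cpt);
	if (lpni == NULL) {
		lnet_net_unlock(cpt);
		return -ENOMEM;
	}
	/* ... the logic moved out of lnet_parse() goes here ... */

	/* Assume the helper linked the peer_ni into a peer (detached case elided). */
	peer = lpni->pni_net->pn_peer;
	lnet_net_unlock(cpt);

	switch (msg->msg_type) {
	case LNET_MSG_PUT:
		/* hdr->msg.put.match_bits tells us whether this is a PING PUSH;
		 * for a push the buffer and peer/peer_net links are set up first. */
		rc = lnet_parse_put(ni, msg);
		if (rc == 0 && lnet_msg_is_ping_push(msg))
			lnet_peer_fsm_event(peer, LNET_EVENT_RECV_PING_PUSH);
		break;
	case LNET_MSG_REPLY:
		rc = lnet_parse_reply(ni, msg);
		if (rc == 0 && lnet_msg_is_ping_reply(msg))
			lnet_peer_fsm_event(peer, LNET_EVENT_RECV_PING_REPLY);
		break;
	case LNET_MSG_ACK:
		rc = lnet_parse_ack(ni, msg);
		lnet_peer_fsm_event(peer, LNET_EVENT_RECV_ACK);
		break;
	case LNET_MSG_GET:
		rc = lnet_parse_get(ni, msg, msg->msg_rdma_get);
		break;
	default:
		LASSERT(0);
		rc = -EPROTO;
	}
	return rc;
}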

Notes

For background information.

By the time we return from lnet_parse_<>() we should have:

  • read the message into the MD
  • called the eq_callback
  • detached the MD
  • freed the message

Given this order, we can do normal processing, and once we return from this processing we should have the information we need copied into a local buffer, which we can then pass to the Dynamic Discovery thread to process.

High-level call flow

lnet_parse_put()
	lnet_ptl_match_md()
	case LNET_MATCHMD_OK
		lnet_recv_put()
			lnet_ni_recv()
				lnd_recv()
					lnet_finalize()
						lnet_msg_detach_md()
							lnet_eq_enqueue_event()
								eq->eq_callback(ev);

Sending

lnet_peer_send_msg()

This API will replace lnet_send() and will exist in the peer module.

This API is going to trigger:

  • Discovery of the peer, if needed
  • Determination of the local/remote pathway
  • Sending of the message

The algorithm will be similar to:

  • find or create peer_ni with msg->msg_target.nid under lock
    • if peer_ni is detached
      • create peer_net
      • add peer_ni to peer_net
      • create peer
      • add peer_net to peer

      • return peer

    • if peer_ni is part of a peer return that peer
    • Feed LNET_EVENT_SEND to the peer
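
A minimal sketch of lnet_peer_send_msg() under the same assumptions; lnet_peer_ni_find_or_create_locked() and lnet_peer_fsm_event() are placeholder names, and the find-or-create helper is assumed to build the peer_net/peer linkage when the peer_ni starts out detached.

/* Sketch only; helper names are placeholders. */
int
lnet_peer_send_msg(lnet_msg_t *msg)
{
	struct lnet_peer_ni *lpni;
	struct lnet_peer *peer;
	int cpt = lnet_cpt_of_nid(msg->msg_target.nid);

	lnet_net_lock(cpt);
	/* Find or create the peer_ni for the target NID. */
	lpni = lnet_peer_ni_find_or_create_locked(msg->msg_target.nid, cpt);
	if (lpni == NULL) {
		lnet_net_unlock(cpt);
		return -ENOMEM;
	}
	/* The detached case is handled inside the helper: peer_net and
	 * peer are created and linked, so a peer always exists here. */
	peer = lpni->pni_net->pn_peer;
	lnet_net_unlock(cpt);

	/* LNET_EVENT_SEND triggers discovery if needed, then pathway
	 * selection and the actual send. */
	return lnet_peer_fsm_event(peer, LNET_EVENT_SEND);
}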

DLC

lnet_peer_add_peer_ni()

  • api_mutex_lock()
    • There is the concept of primary NID.
      • Find or create the peer_ni with the primary NID
      • if a peer_ni is found and it's attached then
        • ensure that the peer NIDs provided are all unique to this peer.
        • return peer
      • if a peer_ni is detached then
        • create peer_net
        • add peer_ni to peer_net
        • create peer
        • add peer_net to peer
        • return peer
    • Feed peer LNET_EVENT_DLC_ADD_PEER_NI
  • api_mutex_unlock()

lnet_peer_del_peer_ni()

  • api_mutex_lock()
    • There is the concept of primary NID.
      • Find the peer_ni with the primary NID
      • if a peer_ni is found and it's attached then
        • ensure that the peer NIDs provided are all unique to this peer.
        • return peer
      • if a peer_ni is detached then
        • Report error.
    • Feed peer LNET_EVENT_DLC_DEL_PEER_NI
  • api_mutex_unlock()

lnet_peer_local_ni_cfg_change()

  • api_mutex_lock()
    • for each peer on the peer list
      • Feed peer LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
  • api_mutex_unlock()
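
A small sketch of lnet_peer_local_ni_cfg_change(); the global peer list (the_lnet.ln_peers) and lnet_peer_fsm_event() are placeholders, and api_mutex_lock()/api_mutex_unlock() are shown as a mutex on the_lnet.ln_api_mutex.

/* Sketch only; the peer list and the FSM helper are placeholders. */
void
lnet_peer_local_ni_cfg_change(void)
{
	struct lnet_peer *peer;

	mutex_lock(&the_lnet.ln_api_mutex);
	list_for_each_entry(peer, &the_lnet.ln_peers, lp_list)
		lnet_peer_fsm_event(peer, LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE);
	mutex_unlock(&the_lnet.ln_api_mutex);
}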

 

FSM Table

Note that when a peer is being worked on it gets locked so that no other thread can change it. This prevents unexpected state transitions.

Calling the FSM Action functions

lnet_peer_lock()
action_fn_locked = lookup_peer_action_function(peer, event);
action_fn_locked()
lnet_peer_unlock()
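
As a sketch of what lookup_peer_action_function() could be, one option is a two-dimensional table of function pointers indexed by state and event, using the enums sketched earlier. The table, the handler names and the lp_state field are placeholders, and the note below discusses why per-event functions with a switch on the state may be preferable.

/* Sketch only: a [state][event] table of action functions. */
typedef int (*lnet_peer_action_fn_t)(struct lnet_peer *peer, void *args);

static lnet_peer_action_fn_t
lnet_peer_fsm_table[LNET_PEER_STATE_MAX][LNET_EVENT_MAX] = {
	[LNET_PEER_STATE_INIT] = {
		[LNET_EVENT_RECV_ACK]       = lnet_peer_action_noop,
		[LNET_EVENT_RECV_PING_PUSH] = lnet_peer_init_recv_push,
		[LNET_EVENT_SEND]           = lnet_peer_init_send,
		/* ... remaining events ... */
	},
	[LNET_PEER_STATE_ACTIVE] = {
		[LNET_EVENT_SEND]           = lnet_peer_active_send,
		/* ... remaining events ... */
	},
	/* ... remaining states ... */
};

static lnet_peer_action_fn_t
lookup_peer_action_function(struct lnet_peer *peer, enum lnet_peer_event event)
{
	return lnet_peer_fsm_table[peer->lp_state][event];
}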

[Apologies for intruding here. This note outgrew the limited space available in the comment column.

The "traditional" implementation of an FSM in C is to have a function per event type, with a switch inside on the FSM state of the object, and actions again being functions (but often open-coded if simple and unique). Example:

// One for each event type
int lnet_peer_event_recv_push_ack(peer, args)
{
	recv_push_ack_prologue;
	switch (peer->state) {
	case LNET_PEER_STATE_INIT:
		peer_state_init_event_recv_push_ack_action(peer, args);
		peer->state = newstate;
		break;
	case LNET_PEER_STATE_VERIFY:
		peer_state_verify_event_recv_push_ack_action(peer, args);
		peer->state = newstate;
		break;
	...
	}
	recv_push_ack_epilogue;
	return ...;
}

int lnet_peer_event_recv_push_mesg(peer, args) ...
int lnet_peer_event_recv_ping_mesg(peer, args) ...
int lnet_peer_event_recv_ping_reply(peer, args) ...

It is certainly possible to replace the switch with a lookup in an array of function pointers (one such array per event type with entries for each state). But it is not unusual that the bulk of the event handler is the common code (the prologue/epilogue code above) with only a small amount of code specific to the state. In that case going through function pointers may actually obscure code flow and logic more than it clarifies.

It is also possible to implement a generic event handler:

int lnet_peer_event(peer, event, args)
{
	event_prologue;
	peer_lookup_action(peer, event)(peer, args);
	event_epilogue;
	return ...;
}
// calls now change like this:
	lnet_peer_recv_push_mesg_event(peer, args);
// becomes
	lnet_peer_event(peer, LNET_PEER_EVENT_RECV_PUSH_MESG, args);

This I dislike for several reasons. It gives us a rather odd bounce from event-specific code through generic code to (again) event-specific code. The actual work still happens in the individual action functions, and these will typically be unique to each <state,event> pair. While this makes it clear that we're dealing with a state machine, in terms of being able to follow the logic of the code I see no gain, and actually expect a loss: following code flow through function pointers is always more difficult than tracing a simple call. Moreover, each event type carries its own unique arguments (the push message with its data versus the ping reply with its data versus the peer ni that was added or removed) and now these have to be force-fitted through a single uniform interface to fit the signature of lnet_peer_event() and/or the common signatures of the action functions. That's effectively a typeless interface, and those also harm the ability to comprehend the code, and hamper the compiler's ability to detect some forms of abuse, like passing the wrong type of parameters.]

LNET_PEER_STATE_INIT

Event / Action
LNET_EVENT_RECV_ACK
  • No-op.
LNET_EVENT_RECV_PING_PUSH
  • Notify DD thread that a ping push was received
  • unlock_peer 
  • wait for the push to finish processing
  • set peer state: LNET_PEER_STATE_ACTIVE
  • lock_peer

On the DD thread (see the sketch after this table)

  • lock_peer
    • compare push sequence number with last one received.
    • if the push data has an older sequence number ignore.
    • if it has a newer sequence number process and store sequence number.
    • for each peer_nid in the peer_info
      • create a peer_ni
      • create a peer_net if different
      • add peer_ni to peer_net
      • add peer_net to peer
      • Do a search/merge for possible peer_ni duplicates
  • unlock_peer
LNET_EVENT_RECV_PING_REPLY

Impossible Event. Log error

LNET_EVENT_SEND
  • add peer to the peer list if it's not already added.
  • Set the state of the peer to LNET_PEER_STATE_DISCOVERY if DD is on, or LNET_PEER_STATE_ACTIVE if not
  • if DD is on
    • Signal DD thread that we need to do discovery using peer_nid
    • unlock_peer
    • Wait until you're woken up by the DD thread
    • lock_peer
      • When you're woken up, check that the discovery was successful
      • set peer state to LNET_PEER_STATE_ACTIVE
  • TODO: (to be detailed later) The two steps below need to be done with proper lnet_net_lock
    • Run the selection algorithm to determine the pathway
    • Send the message.

On the DD thread

  • lock_peer
    • set the state of the peer to LNET_PEER_STATE_WAIT_PING_RESPONSE
    • send lnet_ping
  • unlock_peer
LNET_EVENT_DLC_ADD_PEER_NI
  • for each peer_nid in list
    • create the peer_nets if need be
    • add peer_NI to peer nets
    • add peer to the list of peers if not already added.
  • If DD is ON
    • set state to LNET_PEER_STATE_VERIFY
  • else
    • set state to LNET_PEER_STATE_ACTIVE
LNET_EVENT_DLC_DEL_PEER_NI
  • No-op. The lookup for the peer should return NULL.
LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
  • No-op. This peer won't be on the peer list yet.
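
The DD-thread processing of a received push (the sequence-number check followed by merging the NIDs from the ping info into the peer), as described under LNET_EVENT_RECV_PING_PUSH above, might look roughly like this. lp_push_seqno, lnet_peer_ni_add_locked() and lnet_peer_merge_dups_locked() are placeholder names.

/* Sketch only; field and helper names are placeholders. */
static void
lnet_peer_process_push_locked(struct lnet_peer *peer, ping_info_t *info,
			      __u64 seqno)
{
	int i;

	/* Older than (or the same as) the last push processed: ignore it. */
	if (seqno <= peer->lp_push_seqno)
		return;
	peer->lp_push_seqno = seqno;

	for (i = 0; i < info->pi_nnis; i++) {
		lnet_nid_t nid = info->pi_ni[i].ns_nid;

		/* Create the peer_ni if needed, create the peer_net if the
		 * NID is on a different network, and link both into this peer. */
		lnet_peer_ni_add_locked(peer, nid);
	}
	/* Search for and merge possible peer_ni duplicates. */
	lnet_peer_merge_dups_locked(peer);
}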

LNET_PEER_STATE_WAIT_PING_RESPONSE

Event / Action / Notes
LNET_EVENT_RECV_ACK
  • No-op.
LNET_EVENT_RECV_PING_PUSH
  • Unexpected event on a peer waiting for the ping response; drop and ignore.
 
LNET_EVENT_RECV_PING_REPLY
  • for each peer_nid in the peer_info (TODO: might do that in the DD thread)
    • create a peer_ni if not already there in detached state
    • create a peer_net if different
    • add peer_ni to peer_net
    • add peer_net to peer
    • Do a search/merge for possible peer_ni duplicates
  • if DD is ON && Peer is Multi-rail capable
    • signal DD thread to send a PING PUSH
    • unlock_peer
  • else
    • set peer state LNET_PEER_STATE_ACTIVE

On DD thread

  • send PING PUSH
  • set peer state: LNET_PEER_STATE_PUSH_SENT

Not sure what it means to enlarge the MD data if it's not enough to receive the data.

This can be done when you receive the initial message, before you match the MD. At that point you already know the size of the data and can set the MD size appropriately.

When processing PING REPLY or PING PUSH the following scenarios are possible:

  • Peer NI doesn't exist
  • Peer NI exists in detached state
  • Peer NI exists but attached to a different peer
    • This could be as a result of misconfiguration
    • This could be a duplicate
    • Since DD provides the latest view of the peer
      • Move the peer NI from the older peer to the newer peer
      • If after this process, the older peer has no more peer_nis it should be removed.
  • Peer contains peer NIs not in the ping_info_t
    • In this case these are outdated peer_nis so detach them.

 

LNET_EVENT_SEND
  • unlock_peer
  • Wait on the same peer mechanism for the discovery to complete
  • lock_peer
  • Verify that state is in LNET_PEER_STATE_ACTIVE
  • TODO: (to be detailed later) The two steps below need to be done with proper lnet_net_lock()
    • Run the selection algorithm to determine the pathway
    • Send the message.
 
LNET_EVENT_DLC_ADD_PEER_NI
  • Reject request -EBUSY
 
LNET_EVENT_DLC_DEL_PEER_NI
  • Reject request -EBUSY
 
LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
  • No-op. A ping push will be sent when the ping reply is received
 

LNET_PEER_STATE_ACTIVE

Event / Action / Notes
LNET_EVENT_RECV_ACK
  • No-op. Log error.
LNET_EVENT_RECV_PING_PUSH
  • Notify DD thread that a ping push was received
  • unlock_peer
  • wait for the DD thread to notify you that it's completed ping push processing
  • lock_peer

on the DD thread (TODO: can be done outside the DD thread)

    • lock_peer
      • compare push sequence number with last one received.
      • if the push data has an older sequence number ignore.
      • if it has a newer sequence number process and store sequence number.
      • for each peer_nid in the peer_info
        • create a peer_ni
        • create a peer_net if different
        • add peer_ni to peer_net
        • add peer_net to peer
        • Do a search/merge for possible peer_ni duplicates
    • unlock_peer
 
LNET_EVENT_RECV_PING_REPLY
  • No-op. Log error
 
LNET_EVENT_SEND
  • TODO: (to be detailed later) The two steps below need to be done with proper lnet_net_lock()
    • Run the selection algorithm to determine the pathway
    • Send the message.
 
LNET_EVENT_DLC_ADD_PEER_NI
  • for each peer_nid in list
    • if that peer_nid is already in the peer skip it
    • create the peer_nets if need be
    • add peer_NI to peer nets
    • add peer to the list of peers if not already added

The peer_nis are added to the peer. Prior to entering the FSM we've already verified that none of these peer_nis invalidates the configuration. I don't see a need to keep track of whether a peer was configured via DLC or not.

Another option: whenever a peer NI is added to a peer from DLC and discovery is on, we would want to make sure that this NI is "real". So we could go to the DISCOVERY state and initiate a discovery round. However, I think this is too much of a complication.

LNET_EVENT_DLC_DEL_PEER_NI
  • for each peer_nid in list
    • detach the peer_ni and keep it around as detached
    • if peer_net is empty delete
    • if peer is empty delete
 
LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
  • if DD is ON and Peer is Multi-rail capable
    • set peer state: LNET_PEER_STATE_PUSH_SENT
    • signal DD thread to send the PING PUSH
    • unlock_peer

on DD thread

  • send PING PUSH on primary NID
 

LNET_PEER_STATE_PUSH_SENT

Event / Action
LNET_EVENT_RECV_ACK
  • set peer state to LNET_PEER_STATE_ACTIVE
LNET_EVENT_RECV_PING_PUSH
  • Notify DD thread that a ping push was received
  • unlock_peer
  • wait for the DD thread to notify you that ping push processing is complete

on the DD thread

  • lock_peer
    • compare push sequence number with last one received.
    • if the push data has an older sequence number ignore.
    • if it has a newer sequence number process and store sequence number.
    • for each peer_nid in the peer_info
      • create a peer_ni
      • create a peer_net if different
      • add peer_ni to peer_net
      • add peer_net to peer
      • Do a search/merge for possible peer_ni duplicates
  • unlock_peer
LNET_EVENT_RECV_PING_REPLY
  • No-op. Log error
LNET_EVENT_SEND
  • TODO: (to be detailed later) The two steps below need to be done with proper lnet_net_lock()
    • Run the selection algorithm to determine the pathway
    • Send the message.
LNET_EVENT_DLC_ADD_PEER_NI
  • Reject with -EBUSY
LNET_EVENT_DLC_DEL_PEER_NI
  • Reject with -EBUSY
LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
  • if DD is ON
    • set peer state: LNET_PEER_STATE_PUSH_SENT
    • signal DD thread to send the PING PUSH
    • unlock_peer

on DD thread

  • send PING PUSH on primary NID

LNET_PEER_STATE_DISCOVERY

Event / Action
LNET_EVENT_RECV_ACK
  • Unexpected ACK. Output error.
LNET_EVENT_RECV_PING_PUSH
  • Unexpected PUSH. Output error
LNET_EVENT_RECV_PING_REPLY
  • No-op. Log error
LNET_EVENT_SEND
  • unlock_peer
  • Wait on the same peer mechanism for the discovery to complete
  • lock_peer
  • Verify that state is in LNET_PEER_STATE_ACTIVE
  • TODO: (to be detailed later) The two steps below need to be done with proper lnet_net_lock()
    • Run the selection algorithm to determine the pathway
    • Send the message.
LNET_EVENT_DLC_ADD_PEER_NI
  • Reject with -EBUSY
LNET_EVENT_DLC_DEL_PEER_NI
  • Reject with -EBUSY
LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
  • No-op. You're starting the discovery process, so you'll end up sending a push later anyway

LNET_PEER_STATE_VERIFY

Event / Action
LNET_EVENT_RECV_ACK
  • Unexpected ACK. Output error.
LNET_EVENT_RECV_PING_PUSH
  • Unexpected PUSH. Output error.
LNET_EVENT_RECV_PING_REPLY
  • No-op. Log error
LNET_EVENT_SEND
  • Set the state of the peer to LNET_PEER_STATE_DISCOVERY if DD is on, or LNET_PEER_STATE_ACTIVE if not
  • if DD is on
    • Signal DD thread that we need to do discovery using peer_nid
    • unlock_peer
    • Wait until you're woken up by the DD thread
    • lock_peer
      • When you're woken up, check that the discovery was successful
      • set peer state to LNET_PEER_STATE_ACTIVE
  • TODO: (to be detailed later) The two steps below need to be done with proper lnet_net_lock
    • Run the selection algorithm to determine the pathway
    • Send the message.

On the DD thread

  • send lnet_ping
  • lock_peer
    • set the state of the peer to LNET_PEER_STATE_WAIT_PING_RESPONSE
  • unlock_peer
LNET_EVENT_DLC_ADD_PEER_NI
  • for each peer_ni in list
    • check that the peer_NI are unique
      • if not unique abort operation and roll back (or skip)
    • create the peer_nets if need be
    • add peer_NI to peer nets
    • add peer to the list of peers
  • stay in LNET_PEER_STATE_VERIFY
LNET_EVENT_DLC_DEL_PEER_NI
  • for each peer_ni in list
    • check that the peer_NI are unique
      • if not unique skip that peer_ni
    • delete the peer_ni from the peer or move to a zombie list.
    • if peer_net is empty delete
    • if peer is empty delete
  • stay in LNET_PEER_STATE_VERIFY
LNET_EVENT_DLC_LOCAL_NI_CFG_UPDATE
  • No-op. We will eventually either enter the discovery process or not, and we will handle this later.

Investigation Required

  • If we want to force a discovery, we can create an event FORCE_DISCOVERY on a specific NID
    • Find the peer to which this NID's peer_ni belongs
    • Detach all peer_NIs from that peer
    • Initiate discovery on each of the NIDs requested and transition the peer state appropriately

 


2 Comments

  1. Some sketches of what code would/could look like if the scheme I proposed is followed. Not complete, the hardest part looks to be peer merging code.

    #define LNET_PEER_NIDS_SENT				(1<<0)
    #define LNET_PEER_NIDS_RECVD			(1<<1)
    #define LNET_PEER_MULTI_RAIL			(1<<2)
    #define LNET_PEER_NID_DATA_RECVD		(1<<3)
    #define LNET_PEER_PUSH_ACK_PENDING		(1<<4)
    #define LNET_PEER_PING_REPLY_PENDING	(1<<5)
    #define LNET_PEER_DLC_CONFIG			(1<<6)
    #define LNET_PEER_QUEUED				(1<<7)
    
    #define LNET_PEER_NIDS_UPTODATE    \
        ( LNET_PEER_NIDS_SENT | LNET_PEER_NIDS_RECVD )
    #define LNET_PEER_NID_DATA_PENDING    \
        ( LNET_PEER_PUSH_ACK_PENDING | LNET_PEER_PING_REPLY_PENDING )
    #define LNET_PEER_LOCAL_NIDS_UPTODATE(peer) \
        ((peer)->nid_seqno >= lnet.nid_seqno)
    
    lnet_peer_wait_on_discovery_locked(peer)
    {
        DECLARE_WAITQUEUE(wait, current);
        if (!(peer->state & LNET_PEER_QUEUED)) {
            lnet_peer_hold(peer);
            lnet_discovery_queue_lock();
            list_add_tail(&peer->list, &lnet_discovery_list);
            peer->state |= LNET_PEER_QUEUED;
            if (lnet_discovery_state & LNET_DISCOVERY_SLEEPING)
                wake_up(lnet_discovery_wait);
            lnet_discovery_queue_unlock();
        }
        add_wait_queue_exclusive(&peer->waitqueue, &wait);
        set_current_state(TASK_INTERRUPTIBLE);
        lnet_peer_unlock(peer);
        schedule();
        set_task_state(current, TASK_RUNNING);
        remove_wait_queue(&peer->waitqueue, &wait);
        lnet_peer_lock(peer);
    }
    
    int
    lnet_peer_discovery(peer, msg)
    {
        if (msg->type == LNET_MSG_ACK)
            return;
        if (msg->type == LNET_MSG_REPLY)
            return;
        if (msg->type == LNET_MSG_PUT &&
            msg->put.ptl_index == LNET_RESERVED_PORTAL)
            return;
        if (msg->type == LNET_MSG_GET &&
            msg->get.ptl_index == LNET_RESERVED_PORTAL)
            return;
        if (!lnet_peer_discovery_enabled)
            return;
        peer = peer_ni->pni_net->pn_peer;
        lnet_peer_lock(peer);
        /* This tests a pair of flags */
        if ((peer->state & LNET_PEER_NIDS_UPTODATE) != LNET_PEER_NIDS_UPTODATE)
            lnet_peer_wait_on_discovery_locked(peer);
        lnet_peer_unlock(peer);
    }
    
    int
    lnet_send()
    {
        ...
        lnet_peer_discovery(peer, msg);
        ...
    }
    
    lnet_dequeue_peer(peer)
    {
        lnet_discovery_lock();
        list_del_init(&peer->lp_list);
        peer->lp_state &= ~LNET_PEER_QUEUED;
        lnet_discovery_unlock();
        wake_up(&peer->waitqueue);
        lnet_peer_rele(peer);
        lnet_peer_unlock(peer);
    }
    
    /*
     * Maybe three lists:
     *  queued peers
     *  peers being worked on
     *  peers waiting on a message
     */
    int
    lnet_discovery_thread()
    {
        init;
        lnet_discovery_lock();
        while (!lnet_discovery_state.shutdown) {
            lnet_discovery_thread_sleep();
            if (lnet_discover_state.shutdown)
                break;
            if (list_empty(&lnet_discovery_list))
                continue;
            list_splice_init(&lnet_discovery_list, &lnet_working_list);
            lnet_discovery_unlock();
            while (!list_empty(&lnet_working_list)) {
                /*
                 * Re-check nid recv MD size. This is triggered
                 * the code that handles a ping reply or push
                 * message, and that code also queues the peer
                 * on the discovery list.
                 */
                if (lnet_nid_data_size < lnet_nid_data_needed_size)
                    lnet_resize_nid_data_buffers();
                peer = list_first_entry(&lnet_working_list,
                            struct lnet_peer, lp_list);
                /* A hold was taken on the peer when it was queued. */
                lnet_peer_lock(peer);
                if (peer->state & LNET_PEER_NID_DATA_RECVD)
                    lnet_peer_merge_nid_data_locked(peer);
                /*
                 * This test can be removed if a separate queue is
                 *  used for peers waiting on a reply.
                 */
                if (peer->state & LNET_PEER_NID_DATA_PENDING) {
                    lnet_discovery_lock();
                    list_move(&peer->list, &lnet_discovery_list);
                    lnet_discovery_unlock();
                } else if (lnet_discovery_disabled) {
                    lnet_dequeue_peer_locked(peer);
                } else if (!(peer->state & LNET_PEER_NIDS_RECVD)) {
                    lnet_ping_peer_locked(peer);
                } else if (!(peer->state & LNET_PEER_MULTI_RAIL)) {
                    lnet_dequeue_peer_locked(peer);
                } else if (peer->nid_seqno < lnet.nid_seqno) {
                    lnet_push_peer_locked(peer);
                } else {
                    lnet_dequeue_peer_locked(peer);
                }
                /*
                 * At this point the peer has either been
                 * dequeued, re-added to the discovery_list,
                 * or added to the nid_data_pending_list.
                 * If we dequeued we also dropped a hold, and
                 * the peer might no longer exist after the
                 * unlock.
                 */
                lnet_peer_unlock(peer);
            }
            lnet_discovery_lock();
        }
        lnet_discovery_unlock();
        fini;
    }
  2. Some additional sketches:

    /*
     * Push event handler -- passive side push
     */
    static void
    lnet_push_event_handler(lnet_event_t *event)
    {
        lnet_push_data_t *pushd = event->md.user_ptr;
        lnet_ping_data_t *data = event->md.start;
        lnet_ping_data_t *buf = NULL;
        truncated = 0;
        if (event->type == LNET_EVENT_UNLINK)
            goto check_unlinked;
        /* Catch various malformed data. */
        if (!(data->pi_features & LNET_PING_FEAT_NI_STATUS)) {
            /* Old-style, shouldn't happen. */
        } else if (!(data->pi_features & LNET_PING_FEAT_MULTI_RAIL)) {
            /* This is a push, should not happen. */
        } else if (data->pi_nnis == 0) {
            /* Bad, loopback should always be there... */
        } else if (LNET_NETTYP(LNET_NIDNET(data->pi_ni[0].ns_nid)) != LOLND) {
            /* Loopback not first, badly formatted. */
        }
        /* Locking? */
        /* Are the buffers on the MD large enough? */
        if (data->pi_nnis > the_lnet.current_nnis) {
            /* No, the data was truncated. */
            if (data->pi_nnis > the_lnet.needed_nnis)
                the_lnet.needed_nnis = data->pi_nnis;
            truncated = 1;
        }
        /* event->sender is NID of sender */
        cpt = lnet_cpt_of_nid(event->sender);
        lnet_net_lock(cpt);
        rc = lnet_nid2peer_ni_locked(&peer_ni, event->sender, cpt);
        if (rc) {
            lnet_net_unlock(cpt);
            CERROR();
            return;
        }
        peer = peer_ni->pni_net->pn_peer;
        lnet_peer_addref(peer);
        lnet_net_unlock(cpt);
        lnet_peer_lock(peer);
        LASSERT(event->type == LNET_EVENT_PUT);
        /* Our NIDS state of the peer is no longer up-to-date. */
        peer->state &= ~LNET_PEER_NIDS_RECVD;
        peer->state |= LNET_PEER_MULTI_RAIL;
        
        if (truncated) {
            /* Data was truncated. */
            if (peer->state & LNET_PEER_NID_DATA_RECVD &&
                data->pi_ni[0].ns_status > peer->lp_nid_data->pi_ni[0].ns_status) {
                /*
                 * The peer sent newer data than the packet we have
                 * yet to process. Drop that packet now and
                 * trust we can ping the peer.
                 *
                 * Is this the right thing to do? Maybe we should
                 * merge anyway, to ensure we do have the best
                 * possible update for the interface list.
                 */
                buf = peer->lp_nid_data;
                LIBCFS_FREE(buf);
                peer->state &= ~LNET_PEER_NID_DATA_RECVD;
            }
            goto unlock_peer;
        }
        if (peer->state & LNET_PEER_NID_DATA_RECVD) {
            /* A race, a data buffer is already attached. */
            buf = peer->lp_nid_data;
            peer->lp_nid_data = NULL;
            /* If the attached data is newer we're done. */
            if (data->pi_ni[0].ns_status <= buf->pi_ni[0].ns_status) {
                peer->lp_nid_data = buf;
                goto unlock_peer;
            }
            if (data->pi_nnis > buf->pi_nnis) {
                /* There is buf but it is too small. */
                LIBCFS_FREE(buf);
                LIBCFS_ALLOC_ATOMIC(buf, size);
            }
        } else {
            /* CPT-alloc? */
            LIBCFS_ALLOC_ATOMIC(buf, size);
        }
        /*
         * A special case could be failing to allocate the buf for
         * data that proclaims the peer shut down its last interface.
         * (But if a ping is triggered that failure will be telling.)
         */
        if (buf) {
            memcpy(buf, data, size);
            peer->lp_nid_data = buf;
            peer->state |= LNET_PEER_NID_DATA_RECVD;
        } else {
            peer->state &= ~LNET_PEER_NID_DATA_RECVD;
        }
        /* Maybe update peer_ni state here -- but maybe not. */
        for (i = 1; i < buf->pi_nnis; i++) {
            ...
        }
    unlock_peer:
        if (!(peer->state & LNET_PEER_QUEUED)) {
            lnet_peer_addref(peer);
            lnet_discovery_queue_lock();
            list_add_tail(&peer->list, &lnet_discovery_list);
            peer->state |= LNET_PEER_QUEUED;
            if (lnet_discovery_state & LNET_DISCOVERY_SLEEPING)
                wake_up(lnet_discovery_wait);
            lnet_discovery_queue_unlock();
        }
        lnet_peer_unlock(peer);
        lnet_peer_decref(peer);
        lnet_peer_ni_decref(peer_ni); /* Hold from nid2peer_ni */
    check_unlinked:
        if (event->unlinked) {
            LNetInvalidateHandle(...);
        }
    }
    
    /*
     * Push ack event handler -- active side push
     */
    static void
    lnet_push_ack_event_handler(lnet_event_t *event)
    {
        if (event->type == LNET_EVENT_UNLINK)
            goto check_unlinked;
        /* event->target.nid is NID of peer */
        cpt = lnet_cpt_of_nid(event->target.nid);
        lnet_net_lock(cpt);
        rc = lnet_nid2peer_ni_locked(&peer_ni, event->sender, cpt);
        if (rc) {
            lnet_net_unlock(cpt);
            CERROR();
            return;
        }
        peer = peer_ni->pni_net->pn_peer;
        lnet_peer_addref(peer);
        lnet_net_unlock(cpt);
        lnet_peer_lock(peer);
        if (event->type != LNET_EVENT_ACK) {
            LASSERT(event->type == LNET_EVENT_SEND);
            /*
             * LNET_EVENT_SEND means the push was sent, but
             * nothing else. In particular it doesn't guarantee
             * the push will be received, and moreover we can get
             * the LNET_EVENT_ACK before the LNET_EVENT_SEND!
             *
             * It does indicate that the buffers being sent from
             * can be reused.
             */
            goto unlock_peer;
            /* UNBLOCK_MD instead? */
            /* compare ping_info use */
        }
        if (!(peer->state & LNET_PEER_PUSH_ACK_PENDING)) {
            /* Huh? */
            goto unlock_peer;
        }
        peer->state &= ~LNET_PEER_PUSH_ACK_PENDING;
    unblock_md:
        /* MD is "blocked" in that we don't want to change it while in use. */
    unlock_peer:
        if (!(peer->state & LNET_PEER_QUEUED)) {
            lnet_peer_addref(peer);
            lnet_discovery_queue_lock();
            list_add_tail(&peer->list, &lnet_discovery_list);
            peer->state |= LNET_PEER_QUEUED;
            if (lnet_discovery_state & LNET_DISCOVERY_SLEEPING)
                wake_up(lnet_discovery_wait);
            lnet_discovery_queue_unlock();
        }
        lnet_peer_unlock(peer);
        lnet_peer_decref(peer);
        lnet_peer_ni_decref(peer_ni); /* Hold from nid2peer_ni */
    check_unlinked:
        if (event->unlinked) {
            LNetInvalidateHandle(...);
        }
    }
    
    
    /*
     * Adding peer with DLC -- very sketchy
     */
    int
    lnet_peer_add_peer_ni(...)
    {
        peer = ...
        lnet_peer_lock(peer);
        if (peer->state & LNET_PEER_DLC_CONFIG) {
            /* peer matches DLC-provided config */
            ...
        } else {
            /* peer doesn't match DLC-provided config. */
            /* Assuming DLC overrides Discovery...
             *   tag non-DLC peer_ni
             *   I'm inclined to leave these attached until
             *   further notice -- present but disabled. If DLC
             *   and Discovery match when fully applied, then the
             *   peer_ni disabled here will be re-enabled by
             *   subsequent calls to this function.
             * Warn "DLC is redefining discovered peer ..."
             */
            peer->state |= LNET_PEER_DLC_CONFIG;
        }
        /*
         * No longer a match with received NIDS. Clearing this flag
         * triggers verification if that has been enabled.
         */
        if (peer->state & LNET_PEER_NIDS_RECVD) {
            peer->state &= ~LNET_PEER_NIDS_RECVD;
            lnet_queue_peer_locked(peer);
        }
        lnet_peer_unlock(peer);
        ...
    }