Browse Source

gossipd: explicitly track which peers are important.

Important peers don't have a maximum number of reconnect attempts, and
we always try to reconnect when such a peer dies.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
ppa-0.6.1
Rusty Russell 7 years ago
parent
commit
c9fa9817f6
  1. 89
      gossipd/gossip.c
  2. 5
      gossipd/gossip_wire.csv
  3. 1
      lightningd/gossip_control.c

89
gossipd/gossip.c

@ -102,12 +102,12 @@ struct reaching {
/* Did we succeed? */ /* Did we succeed? */
bool succeeded; bool succeeded;
/* Is this an important peer to keep connected? */
bool keep_connected;
/* How many times have we attempted to connect? */ /* How many times have we attempted to connect? */
u32 attempts; u32 attempts;
/* How many times to attempt */
u32 max_attempts;
/* Timestamp of the first attempt */ /* Timestamp of the first attempt */
u32 first_attempt; u32 first_attempt;
}; };
@ -158,7 +158,7 @@ struct peer {
bool gossip_sync; bool gossip_sync;
/* If we die, should we reach again? */ /* If we die, should we reach again? */
bool reach_again; bool keep_connected;
/* Only one of these is set: */ /* Only one of these is set: */
struct local_peer_state *local; struct local_peer_state *local;
@ -179,13 +179,14 @@ static struct io_plan *peer_start_gossip(struct io_conn *conn,
struct peer *peer); struct peer *peer);
static bool send_peer_with_fds(struct peer *peer, const u8 *msg); static bool send_peer_with_fds(struct peer *peer, const u8 *msg);
static void wake_pkt_out(struct peer *peer); static void wake_pkt_out(struct peer *peer);
static bool try_reach_peer(struct daemon *daemon, const struct pubkey *id); static bool try_reach_peer(struct daemon *daemon, const struct pubkey *id,
bool keep_connected);
static void destroy_peer(struct peer *peer) static void destroy_peer(struct peer *peer)
{ {
list_del_from(&peer->daemon->peers, &peer->list); list_del_from(&peer->daemon->peers, &peer->list);
if (peer->reach_again) if (peer->keep_connected)
try_reach_peer(peer->daemon, &peer->id); try_reach_peer(peer->daemon, &peer->id, true);
} }
static struct peer *find_peer(struct daemon *daemon, const struct pubkey *id) static struct peer *find_peer(struct daemon *daemon, const struct pubkey *id)
@ -242,7 +243,7 @@ static struct peer *new_peer(const tal_t *ctx,
peer->daemon = daemon; peer->daemon = daemon;
peer->local = new_local_peer_state(peer, cs); peer->local = new_local_peer_state(peer, cs);
peer->remote = NULL; peer->remote = NULL;
peer->reach_again = false; peer->keep_connected = false;
return peer; return peer;
} }
@ -273,18 +274,21 @@ static struct reaching *find_reaching(struct daemon *daemon,
return NULL; return NULL;
} }
static void reached_peer(struct daemon *daemon, const struct pubkey *id, static void reached_peer(struct peer *peer, struct io_conn *conn)
struct io_conn *conn)
{ {
struct reaching *r = find_reaching(daemon, id); struct reaching *r = find_reaching(peer->daemon, &peer->id);
if (!r) if (!r)
return; return;
/* If this peer was important, remember, so we reconnect. */
if (r->keep_connected)
peer->keep_connected = true;
/* OK, we've reached the peer successfully, stop retrying. */ /* OK, we've reached the peer successfully, stop retrying. */
/* Don't free conn with reach. */ /* Don't free conn with reach. */
tal_steal(daemon, conn); tal_steal(peer->daemon, conn);
/* Don't call connect_failed */ /* Don't call connect_failed */
io_set_finish(conn, NULL, NULL); io_set_finish(conn, NULL, NULL);
@ -350,7 +354,7 @@ static struct io_plan *peer_init_received(struct io_conn *conn,
return io_close(conn); return io_close(conn);
} }
reached_peer(peer->daemon, &peer->id, conn); reached_peer(peer, conn);
/* BOLT #7: /* BOLT #7:
* *
@ -1574,7 +1578,7 @@ static void connect_failed(struct io_conn *conn, struct reaching *reach)
u32 diff = time_now().ts.tv_sec - reach->first_attempt; u32 diff = time_now().ts.tv_sec - reach->first_attempt;
reach->attempts++; reach->attempts++;
if (reach->attempts >= reach->max_attempts) { if (!reach->keep_connected && reach->attempts >= 10) {
status_info("Failed to connect after %d attempts, giving up " status_info("Failed to connect after %d attempts, giving up "
"after %d seconds", "after %d seconds",
reach->attempts, diff); reach->attempts, diff);
@ -1724,25 +1728,31 @@ static void try_connect(struct reaching *reach)
} }
/* Returns true if we're already connected. */ /* Returns true if we're already connected. */
static bool try_reach_peer(struct daemon *daemon, const struct pubkey *id) static bool try_reach_peer(struct daemon *daemon, const struct pubkey *id,
bool keep_connected)
{ {
struct reaching *reach; struct reaching *reach;
struct peer *peer; struct peer *peer;
if (find_reaching(daemon, id)) { reach = find_reaching(daemon, id);
/* FIXME: Perhaps kick timer in this case? */ if (reach) {
/* May not have been important before */
if (keep_connected)
reach->keep_connected = true;
status_trace("try_reach_peer: already trying to reach %s", status_trace("try_reach_peer: already trying to reach %s",
type_to_string(tmpctx, struct pubkey, id)); type_to_string(tmpctx, struct pubkey, id));
return false; return false;
} }
/* Master might find out before we do that a peer is dead; if we /* Master might find out before we do that a peer is dead. */
* seem to be connected just mark it for reconnect. */
peer = find_peer(daemon, id); peer = find_peer(daemon, id);
if (peer) { if (peer) {
status_trace("reach_peer: have %s, will retry if it dies", /* May not have been important before */
type_to_string(tmpctx, struct pubkey, id)); if (keep_connected)
peer->reach_again = true; peer->keep_connected = true;
status_trace("reach_peer: have peer %s%s",
type_to_string(tmpctx, struct pubkey, id),
peer->keep_connected ? " (will retry if it dies)" : "");
return true; return true;
} }
@ -1752,7 +1762,7 @@ static bool try_reach_peer(struct daemon *daemon, const struct pubkey *id)
reach->id = *id; reach->id = *id;
reach->first_attempt = time_now().ts.tv_sec; reach->first_attempt = time_now().ts.tv_sec;
reach->attempts = 0; reach->attempts = 0;
reach->max_attempts = 10; reach->keep_connected = keep_connected;
list_add_tail(&daemon->reaching, &reach->list); list_add_tail(&daemon->reaching, &reach->list);
tal_add_destructor(reach, destroy_reaching); tal_add_destructor(reach, destroy_reaching);
@ -1760,7 +1770,6 @@ static bool try_reach_peer(struct daemon *daemon, const struct pubkey *id)
return false; return false;
} }
/* This catches all kinds of failures, like network errors. */
static struct io_plan *reach_peer(struct io_conn *conn, static struct io_plan *reach_peer(struct io_conn *conn,
struct daemon *daemon, const u8 *msg) struct daemon *daemon, const u8 *msg)
{ {
@ -1770,7 +1779,7 @@ static struct io_plan *reach_peer(struct io_conn *conn,
master_badmsg(WIRE_GOSSIPCTL_REACH_PEER, msg); master_badmsg(WIRE_GOSSIPCTL_REACH_PEER, msg);
/* Master can't check this itself, because that's racy. */ /* Master can't check this itself, because that's racy. */
if (try_reach_peer(daemon, &id)) { if (try_reach_peer(daemon, &id, false)) {
daemon_conn_send(&daemon->master, daemon_conn_send(&daemon->master,
take(towire_gossip_peer_already_connected(NULL, take(towire_gossip_peer_already_connected(NULL,
&id))); &id)));
@ -1796,6 +1805,33 @@ static struct io_plan *addr_hint(struct io_conn *conn,
return daemon_conn_read_next(conn, &daemon->master); return daemon_conn_read_next(conn, &daemon->master);
} }
static struct io_plan *peer_important(struct io_conn *conn,
struct daemon *daemon, const u8 *msg)
{
struct pubkey id;
bool important;
struct reaching *r;
struct peer *p;
if (!fromwire_gossipctl_peer_important(msg, &id, &important))
master_badmsg(WIRE_GOSSIPCTL_REACH_PEER, msg);
r = find_reaching(daemon, &id);
p = find_peer(daemon, &id);
/* Override keep_connected flag everywhere */
if (r)
r->keep_connected = important;
if (p)
p->keep_connected = important;
/* If it's important and we're not connected/connecting, do so now. */
if (important && !r && !p)
try_reach_peer(daemon, &id, true);
return daemon_conn_read_next(conn, &daemon->master);
}
static struct io_plan *get_peers(struct io_conn *conn, static struct io_plan *get_peers(struct io_conn *conn,
struct daemon *daemon, const u8 *msg) struct daemon *daemon, const u8 *msg)
{ {
@ -2044,6 +2080,9 @@ static struct io_plan *recv_req(struct io_conn *conn, struct daemon_conn *master
case WIRE_GOSSIPCTL_PEER_ADDRHINT: case WIRE_GOSSIPCTL_PEER_ADDRHINT:
return addr_hint(conn, daemon, master->msg_in); return addr_hint(conn, daemon, master->msg_in);
case WIRE_GOSSIPCTL_PEER_IMPORTANT:
return peer_important(conn, daemon, master->msg_in);
case WIRE_GOSSIP_GETPEERS_REQUEST: case WIRE_GOSSIP_GETPEERS_REQUEST:
return get_peers(conn, daemon, master->msg_in); return get_peers(conn, daemon, master->msg_in);

5
gossipd/gossip_wire.csv

@ -32,6 +32,11 @@ gossipctl_peer_addrhint,,addr,struct wireaddr
gossipctl_reach_peer,3001 gossipctl_reach_peer,3001
gossipctl_reach_peer,,id,struct pubkey gossipctl_reach_peer,,id,struct pubkey
# Master -> gossipd: try to always maintain connection to this peer (or not)
gossipctl_peer_important,3010
gossipctl_peer_important,,id,struct pubkey
gossipctl_peer_important,,important,bool
# Gossipd -> master: we got a peer. Two fds: peer and gossip # Gossipd -> master: we got a peer. Two fds: peer and gossip
gossip_peer_connected,3002 gossip_peer_connected,3002
gossip_peer_connected,,id,struct pubkey gossip_peer_connected,,id,struct pubkey

Can't render this file because it has a wrong number of fields in line 6.

1
lightningd/gossip_control.c

@ -137,6 +137,7 @@ static unsigned gossip_msg(struct subd *gossip, const u8 *msg, const int *fds)
case WIRE_GOSSIP_ROUTING_FAILURE: case WIRE_GOSSIP_ROUTING_FAILURE:
case WIRE_GOSSIP_MARK_CHANNEL_UNROUTABLE: case WIRE_GOSSIP_MARK_CHANNEL_UNROUTABLE:
case WIRE_GOSSIPCTL_PEER_DISCONNECT: case WIRE_GOSSIPCTL_PEER_DISCONNECT:
case WIRE_GOSSIPCTL_PEER_IMPORTANT:
/* This is a reply, so never gets through to here. */ /* This is a reply, so never gets through to here. */
case WIRE_GOSSIPCTL_INIT_REPLY: case WIRE_GOSSIPCTL_INIT_REPLY:
case WIRE_GOSSIP_GET_UPDATE_REPLY: case WIRE_GOSSIP_GET_UPDATE_REPLY:

Loading…
Cancel
Save