From ffa5e1c52c233fc6ae344751de225ee79b7457b3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jun 2017 15:47:03 +0930 Subject: [PATCH] peer_fail: differentiate transient and permanent failures. Signed-off-by: Rusty Russell --- lightningd/gossip_control.c | 8 +-- lightningd/peer_control.c | 117 +++++++++++++++++++++++++++++------- lightningd/peer_control.h | 12 +++- lightningd/peer_htlcs.c | 90 ++++++++++++++------------- 4 files changed, 156 insertions(+), 71 deletions(-) diff --git a/lightningd/gossip_control.c b/lightningd/gossip_control.c index 73fe919e6..3ad340c08 100644 --- a/lightningd/gossip_control.c +++ b/lightningd/gossip_control.c @@ -38,8 +38,7 @@ static void peer_bad_message(struct subd *gossip, const u8 *msg) log_debug(gossip->log, "Peer %s gave bad msg %s", type_to_string(msg, struct pubkey, &peer->id), tal_hex(msg, msg)); - peer_fail(peer, "Bad message %s during gossip phase", - gossip_wire_type_name(fromwire_peektype(msg))); + peer_fail_permanent(peer, msg); } static void peer_failed(struct subd *gossip, const u8 *msg) @@ -57,10 +56,7 @@ static void peer_failed(struct subd *gossip, const u8 *msg) if (!peer) fatal("Gossip gave bad peerid %"PRIu64, unique_id); - log_unusual(gossip->log, "Peer %s failed: %.*s", - type_to_string(msg, struct pubkey, &peer->id), - (int)tal_len(err), (const char *)err); - peer_fail(peer, "Error during gossip phase"); + peer_fail_permanent(peer, msg); } static void peer_nongossip(struct subd *gossip, const u8 *msg, diff --git a/lightningd/peer_control.c b/lightningd/peer_control.c index adc6362b8..cc954fff1 100644 --- a/lightningd/peer_control.c +++ b/lightningd/peer_control.c @@ -84,12 +84,58 @@ static void peer_reconnect(struct peer *peer) try_reconnect, peer); } -void peer_fail(struct peer *peer, const char *fmt, ...) +static void drop_to_chain(struct peer *peer) +{ + /* FIXME: Implement. */ +} + +void peer_fail_permanent(struct peer *peer, const u8 *msg) +{ + /* BOLT #1: + * + * The channel is referred to by `channel_id` unless `channel_id` is + * zero (ie. all bytes zero), in which case it refers to all + * channels. */ + static const struct channel_id all_channels; + + log_unusual(peer->log, "Peer permanent failure in %s: %.*s", + peer_state_name(peer->state), + (int)tal_len(msg), (char *)msg); + peer->error = towire_error(peer, &all_channels, msg); + + /* In case we reconnected in the meantime. */ + if (peer->fd != -1) { + /* FIXME: We should retransmit error if this happens. */ + close(peer->fd); + } + + if (peer_persists(peer)) + drop_to_chain(peer); + else + tal_free(peer); + return; +} + +void peer_internal_error(struct peer *peer, const char *fmt, ...) { va_list ap; va_start(ap, fmt); - log_info(peer->log, "Peer failure in %s: ", + log_broken(peer->log, "Peer internal error %s: ", + peer_state_name(peer->state)); + logv_add(peer->log, fmt, ap); + va_end(ap); + + peer_fail_permanent(peer, + take((u8 *)tal_strdup(peer, "Internal error"))); +} + +void peer_fail_transient(struct peer *peer, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + log_info(peer->log, "Peer transient failure in %s: ", peer_state_name(peer->state)); logv_add(peer->log, fmt, ap); va_end(ap); @@ -133,6 +179,15 @@ static bool peer_start_channeld_hsmfd(struct subd *hsm, const u8 *resp, const int *fds, struct peer *peer); +/* Send (encrypted) error message, then close. */ +static struct io_plan *send_error(struct io_conn *conn, struct peer *peer) +{ + struct peer_crypto_state *pcs = tal(conn, struct peer_crypto_state); + init_peer_crypto_state(peer, pcs); + pcs->cs = *peer->cs; + return peer_write_message(conn, pcs, peer->error, (void *)io_close_cb); +} + /* Returns true if we consider this a reconnection. */ static bool peer_reconnected(struct lightningd *ld, const struct pubkey *id, @@ -150,6 +205,16 @@ static bool peer_reconnected(struct lightningd *ld, tal_free(peer->cs); peer->cs = tal_dup(peer, struct crypto_state, cs); + /* BOLT #2: + * + * On reconnection, if a channel is in an error state, the node SHOULD + * retransmit the error packet and ignore any other packets for that + * channel, and the following requirements do not apply. */ + if (peer->error) { + io_new_conn(peer, fd, send_error, peer); + return true; + } + /* We need this for init */ peer->reconnected = true; @@ -282,6 +347,7 @@ void add_peer(struct lightningd *ld, u64 unique_id, /* Fresh peer. */ peer = tal(ld, struct peer); peer->ld = ld; + peer->error = NULL; peer->unique_id = unique_id; peer->owner = NULL; peer->id = *id; @@ -370,8 +436,11 @@ static void peer_owner_finished(struct subd *subd, int status) } subd->peer->owner = NULL; - peer_fail(subd->peer, "Owning subdaemon %s died (%i)", - subd->name, status); + + /* Don't do a transient error if it's already perm failed. */ + if (!subd->peer->error) + peer_fail_transient(subd->peer, "Owning subdaemon %s died (%i)", + subd->name, status); } static int make_listen_fd(struct lightningd *ld, @@ -946,7 +1015,7 @@ static bool peer_start_channeld_hsmfd(struct subd *hsm, const u8 *resp, if (!peer->owner) { log_unusual(peer->log, "Could not subdaemon channel: %s", strerror(errno)); - peer_fail(peer, "Failed to subdaemon channel"); + peer_fail_transient(peer, "Failed to subdaemon channel"); return true; } @@ -1098,21 +1167,24 @@ static bool opening_funder_finished(struct subd *opening, const u8 *resp, bitcoin_txid(fc->funding_tx, fc->peer->funding_txid); if (!structeq(fc->peer->funding_txid, &funding_txid)) { - peer_fail(fc->peer, "Funding txid mismatch:" - " satoshi %"PRIu64" change %"PRIu64" changeidx %u" - " localkey %s remotekey %s", - fc->peer->funding_satoshi, - fc->change, fc->change_keyindex, - type_to_string(fc, struct pubkey, &local_fundingkey), - type_to_string(fc, struct pubkey, - &channel_info->remote_fundingkey)); + peer_internal_error(fc->peer, + "Funding txid mismatch:" + " satoshi %"PRIu64" change %"PRIu64 + " changeidx %u" + " localkey %s remotekey %s", + fc->peer->funding_satoshi, + fc->change, fc->change_keyindex, + type_to_string(fc, struct pubkey, + &local_fundingkey), + type_to_string(fc, struct pubkey, + &channel_info->remote_fundingkey)); return false; } /* We should have sent and received the first commitsig */ if (!peer_save_commitsig_received(fc->peer, 0) || !peer_save_commitsig_sent(fc->peer, 0)) { - peer_fail(fc->peer, "Saving commitsig failed"); + peer_internal_error(fc->peer, "Saving commitsig failed"); return false; } @@ -1246,10 +1318,12 @@ void peer_fundee_open(struct peer *peer, const u8 *from_peer) /* Note: gossipd handles unknown packets, so we don't have to worry * about ignoring odd ones here. */ if (fromwire_peektype(from_peer) != WIRE_OPEN_CHANNEL) { + char *msg = tal_fmt(peer, "Bad message %i (%s) before opening", + fromwire_peektype(from_peer), + wire_type_name(fromwire_peektype(from_peer))); log_unusual(peer->log, "Strange message to exit gossip: %u", fromwire_peektype(from_peer)); - peer_fail(peer, "Bad message during gossiping: %s", - tal_hex(peer, from_peer)); + peer_fail_permanent(peer, (u8 *)take(msg)); return; } @@ -1260,8 +1334,8 @@ void peer_fundee_open(struct peer *peer, const u8 *from_peer) take(&peer->fd), &peer->gossip_client_fd, NULL); if (!peer->owner) { - peer_fail(peer, "Failed to subdaemon opening: %s", - strerror(errno)); + peer_fail_transient(peer, "Failed to subdaemon opening: %s", + strerror(errno)); return; } @@ -1294,7 +1368,8 @@ void peer_fundee_open(struct peer *peer, const u8 *from_peer) /* Careful here! Their message could push us overlength! */ if (tal_len(msg) >= 65536) { - peer_fail(peer, "Unacceptably long open_channel"); + char *err = tal_strdup(peer, "Unacceptably long open_channel"); + peer_fail_permanent(peer, (u8 *)take(err)); return; } subd_req(peer, peer->owner, take(msg), -1, 1, @@ -1343,8 +1418,8 @@ static bool gossip_peer_released(struct subd *gossip, take(&fc->peer->fd), &fc->peer->gossip_client_fd, NULL); if (!opening) { - peer_fail(fc->peer, "Failed to subdaemon opening: %s", - strerror(errno)); + peer_fail_transient(fc->peer, "Failed to subdaemon opening: %s", + strerror(errno)); return true; } fc->peer->owner = opening; diff --git a/lightningd/peer_control.h b/lightningd/peer_control.h index c409f0701..e8237164e 100644 --- a/lightningd/peer_control.h +++ b/lightningd/peer_control.h @@ -24,6 +24,9 @@ struct peer { /* ID of peer */ struct pubkey id; + /* Error message (iff in error state) */ + u8 *error; + /* Their shachain. */ struct shachain their_shachain; @@ -140,8 +143,13 @@ void peer_fundee_open(struct peer *peer, const u8 *msg); void add_peer(struct lightningd *ld, u64 unique_id, int fd, const struct pubkey *id, const struct crypto_state *cs); -/* Peer has failed. */ -PRINTF_FMT(2,3) void peer_fail(struct peer *peer, const char *fmt, ...); + +/* Peer has failed, but try reconnected. */ +PRINTF_FMT(2,3) void peer_fail_transient(struct peer *peer, const char *fmt,...); +/* Peer has failed, give up on it. */ +void peer_fail_permanent(struct peer *peer, const u8 *msg); +/* Permanent error, but due to internal problems, not peer. */ +void peer_internal_error(struct peer *peer, const char *fmt, ...); const char *peer_state_name(enum peer_state state); void peer_set_condition(struct peer *peer, enum peer_state oldstate, diff --git a/lightningd/peer_htlcs.c b/lightningd/peer_htlcs.c index 5cd502242..a7c0fd2f5 100644 --- a/lightningd/peer_htlcs.c +++ b/lightningd/peer_htlcs.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -334,7 +335,7 @@ static bool rcvd_htlc_reply(struct subd *subd, const u8 *msg, const int *fds, &hout->key.id, &failure_code, &failurestr)) { - log_broken(subd->log, "Bad channel_offer_htlc_reply"); + peer_internal_error(subd->peer, "Bad channel_offer_htlc_reply"); tal_free(hout); return false; } @@ -345,9 +346,10 @@ static bool rcvd_htlc_reply(struct subd *subd, const u8 *msg, const int *fds, } if (find_htlc_out(&subd->ld->htlcs_out, hout->key.peer, hout->key.id)) { - log_broken(subd->log, "Bad offer_htlc_reply HTLC id %"PRIu64 - " is a duplicate", - hout->key.id); + peer_internal_error(subd->peer, + "Bad offer_htlc_reply HTLC id %"PRIu64 + " is a duplicate", + hout->key.id); tal_free(hout); return false; } @@ -530,10 +532,11 @@ static bool state_update_ok(struct peer *peer, expected = SENT_REMOVE_COMMIT; if (newstate != expected) { - log_broken(peer->log, "HTLC %s %"PRIu64" invalid update %s->%s", - dir, htlc_id, - htlc_state_name(oldstate), - htlc_state_name(newstate)); + peer_internal_error(peer, + "HTLC %s %"PRIu64" invalid update %s->%s", + dir, htlc_id, + htlc_state_name(oldstate), + htlc_state_name(newstate)); return false; } @@ -582,8 +585,8 @@ static bool peer_accepted_htlc(struct peer *peer, hin = find_htlc_in(&peer->ld->htlcs_in, peer, id); if (!hin) { - log_broken(peer->log, - "peer_got_revoke unknown htlc %"PRIu64, id); + peer_internal_error(peer, + "peer_got_revoke unknown htlc %"PRIu64, id); return false; } @@ -595,7 +598,7 @@ static bool peer_accepted_htlc(struct peer *peer, sizeof(hin->onion_routing_packet)); if (!op) { if (!memeqzero(&hin->shared_secret, sizeof(hin->shared_secret))){ - log_broken(peer->log, + peer_internal_error(peer, "bad onion in got_revoke: %s", tal_hexstr(peer, hin->onion_routing_packet, sizeof(hin->onion_routing_packet))); @@ -665,9 +668,9 @@ static bool peer_fulfilled_our_htlc(struct peer *peer, hout = find_htlc_out(&peer->ld->htlcs_out, peer, fulfilled->id); if (!hout) { - log_broken(peer->log, - "fulfilled_our_htlc unknown htlc %"PRIu64, - fulfilled->id); + peer_internal_error(peer, + "fulfilled_our_htlc unknown htlc %"PRIu64, + fulfilled->id); return false; } @@ -694,9 +697,9 @@ static bool peer_failed_our_htlc(struct peer *peer, hout = find_htlc_out(&peer->ld->htlcs_out, peer, failed->id); if (!hout) { - log_broken(peer->log, - "failed_our_htlc unknown htlc %"PRIu64, - failed->id); + peer_internal_error(peer, + "failed_our_htlc unknown htlc %"PRIu64, + failed->id); return false; } @@ -763,7 +766,7 @@ static bool update_in_htlc(struct peer *peer, u64 id, enum htlc_state newstate) hin = find_htlc_in(&peer->ld->htlcs_in, peer, id); if (!hin) { - log_broken(peer->log, "Can't find in HTLC %"PRIu64, id); + peer_internal_error(peer, "Can't find in HTLC %"PRIu64, id); return false; } @@ -782,7 +785,7 @@ static bool update_out_htlc(struct peer *peer, u64 id, enum htlc_state newstate) hout = find_htlc_out(&peer->ld->htlcs_out, peer, id); if (!hout) { - log_broken(peer->log, "Can't find out HTLC %"PRIu64, id); + peer_internal_error(peer, "Can't find out HTLC %"PRIu64, id); return false; } @@ -825,14 +828,14 @@ int peer_sending_commitsig(struct peer *peer, const u8 *msg) &commitnum, &changed_htlcs, &commit_sig, &htlc_sigs)) { - log_broken(peer->log, "bad channel_sending_commitsig %s", - tal_hex(peer, msg)); + peer_internal_error(peer, "bad channel_sending_commitsig %s", + tal_hex(peer, msg)); return -1; } for (i = 0; i < tal_count(changed_htlcs); i++) { if (!changed_htlc(peer, changed_htlcs + i)) { - log_broken(peer->log, + peer_internal_error(peer, "channel_sending_commitsig: update failed"); return -1; } @@ -848,7 +851,7 @@ int peer_sending_commitsig(struct peer *peer, const u8 *msg) if (num_local_added != 0) { if (maxid != peer->next_htlc_id + num_local_added - 1) { - log_broken(peer->log, + peer_internal_error(peer, "channel_sending_commitsig:" " Added %"PRIu64", maxid now %"PRIu64 " from %"PRIu64, @@ -944,7 +947,7 @@ static bool peer_sending_revocation(struct peer *peer, bool peer_save_commitsig_received(struct peer *peer, u64 commitnum) { if (commitnum != peer->num_commits_received) { - log_broken(peer->log, + peer_internal_error(peer, "channel_got_commitsig: expected commitnum %"PRIu64 " got %"PRIu64, peer->num_commits_received, commitnum); @@ -961,7 +964,7 @@ bool peer_save_commitsig_received(struct peer *peer, u64 commitnum) bool peer_save_commitsig_sent(struct peer *peer, u64 commitnum) { if (commitnum != peer->num_commits_sent) { - log_broken(peer->log, + peer_internal_error(peer, "channel_sent_commitsig: expected commitnum %"PRIu64 " got %"PRIu64, peer->num_commits_sent, commitnum); @@ -996,8 +999,9 @@ int peer_got_commitsig(struct peer *peer, const u8 *msg) &fulfilled, &failed, &changed)) { - log_broken(peer->log, "bad fromwire_channel_got_commitsig %s", - tal_hex(peer, msg)); + peer_internal_error(peer, + "bad fromwire_channel_got_commitsig %s", + tal_hex(peer, msg)); return -1; } @@ -1026,8 +1030,8 @@ int peer_got_commitsig(struct peer *peer, const u8 *msg) for (i = 0; i < tal_count(changed); i++) { if (!changed_htlc(peer, &changed[i])) { - log_broken(peer->log, - "got_commitsig: update failed"); + peer_internal_error(peer, + "got_commitsig: update failed"); return -1; } } @@ -1068,8 +1072,8 @@ int peer_got_revoke(struct peer *peer, const u8 *msg) &revokenum, &per_commitment_secret, &next_per_commitment_point, &changed)) { - log_broken(peer->log, "bad fromwire_channel_got_revoke %s", - tal_hex(peer, msg)); + peer_internal_error(peer, "bad fromwire_channel_got_revoke %s", + tal_hex(peer, msg)); return -1; } @@ -1087,23 +1091,23 @@ int peer_got_revoke(struct peer *peer, const u8 *msg) return -1; } else { if (!changed_htlc(peer, &changed[i])) { - log_broken(peer->log, - "got_revoke: update failed"); + peer_internal_error(peer, + "got_revoke: update failed"); return -1; } } } if (revokenum >= (1ULL << 48)) { - log_broken(peer->log, "got_revoke: too many txs %"PRIu64, - revokenum); + peer_internal_error(peer, "got_revoke: too many txs %"PRIu64, + revokenum); return -1; } if (revokenum != peer->num_revocations_received) { - log_broken(peer->log, "got_revoke: expected %"PRIu64 - " got %"PRIu64, - peer->num_revocations_received, revokenum); + peer_internal_error(peer, "got_revoke: expected %"PRIu64 + " got %"PRIu64, + peer->num_revocations_received, revokenum); return -1; } @@ -1115,10 +1119,12 @@ int peer_got_revoke(struct peer *peer, const u8 *msg) if (!shachain_add_hash(&peer->their_shachain, shachain_index(revokenum), &per_commitment_secret)) { - peer_fail(peer, "Bad per_commitment_secret %s for %"PRIu64, - type_to_string(msg, struct sha256, - &per_commitment_secret), - revokenum); + char *err = tal_fmt(peer, + "Bad per_commitment_secret %s for %"PRIu64, + type_to_string(msg, struct sha256, + &per_commitment_secret), + revokenum); + peer_fail_permanent(peer, take((u8 *)err)); return -1; }