
channeld: treat all incoming errors as "soft", so we retry.

We still close the channel if we *send* an error, but we seem to have hit
another case where LND sends an error that seems transient. This makes a
best-effort attempt to preserve our channel in that case.

Some tests have to be modified, since they don't terminate as they did
previously :(

Changelog-Changed: quirks: We'll now reconnect and retry if we get an error on an established channel. This works around lnd sending error messages that may be non-fatal.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Rusty Russell (committed by Christian Decker)
commit 1d0c433dc4
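
In short, a new soft_error flag is threaded from handle_peer_gossip_or_error()
down to peer_failed_received_errmsg(): channeld passes true, so an incoming
error is reported to lightningd as transient and the connection is retried,
while openingd and closingd pass false and keep failing hard. A minimal sketch
of the resulting call convention (signatures and names taken from the diff
below):

    /* Established channel (channeld): incoming errors are soft, so we retry. */
    handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true, msg);

    /* Still opening or closing (openingd/closingd): errors stay fatal. */
    handle_peer_gossip_or_error(pps, channel_id, false, msg);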
 channeld/channeld.c       |  7
 closingd/closingd.c       |  2
 common/peer_failed.c      | 16
 common/peer_failed.h      |  4
 common/read_peer_msg.c    |  4
 common/read_peer_msg.h    |  2
 openingd/openingd.c       |  4
 tests/test_closing.py     | 10
 tests/test_connection.py  | 12
 tests/test_pay.py         | 10
 10 files changed

channeld/channeld.c

@@ -1824,7 +1824,9 @@ static void peer_in(struct peer *peer, const u8 *msg)
 		return;
 	}
 
-	if (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, msg))
+	/* Since LND seems to send errors which aren't actually fatal events,
+	 * we treat errors here as soft. */
+	if (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true, msg))
 		return;
 
 	/* Must get funding_locked before almost anything. */
@@ -2322,7 +2324,8 @@ static void peer_reconnect(struct peer *peer,
 	do {
 		clean_tmpctx();
 		msg = sync_crypto_read(tmpctx, peer->pps);
-	} while (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, msg)
+	} while (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true,
+					     msg)
		 || capture_premature_msg(&premature_msgs, msg));
 
 	if (peer->channel->option_static_remotekey) {

closingd/closingd.c

@@ -103,7 +103,7 @@ static u8 *closing_read_peer_msg(const tal_t *ctx,
 			handle_gossip_msg(pps, take(msg));
 			continue;
 		}
-		if (!handle_peer_gossip_or_error(pps, channel_id, msg))
+		if (!handle_peer_gossip_or_error(pps, channel_id, false, msg))
 			return msg;
 	}
 }

common/peer_failed.c

@@ -52,25 +52,15 @@ void peer_failed(struct per_peer_state *pps,
 /* We're failing because peer sent us an error message */
 void peer_failed_received_errmsg(struct per_peer_state *pps,
 				 const char *desc,
-				 const struct channel_id *channel_id)
+				 const struct channel_id *channel_id,
+				 bool soft_error)
 {
 	static const struct channel_id all_channels;
 	u8 *msg;
-	bool sync_error;
-
-	/* <+roasbeef> rusty: sync error can just be a timing thing
-	 * <+roasbeef> andn doesn't always mean that we can't continue forwrd,
-	 * or y'all sent the wrong info
-	 */
-	/* So while LND is sitting in the corner eating paint, back off. */
-	sync_error = strstr(desc, "sync error");
-	if (sync_error)
-		status_unusual("Peer said '%s' so we'll come back later",
-			       desc);
 
 	if (!channel_id)
 		channel_id = &all_channels;
-	msg = towire_status_peer_error(NULL, channel_id, desc, sync_error, pps,
+	msg = towire_status_peer_error(NULL, channel_id, desc, soft_error, pps,
 				       NULL);
 	peer_billboard(true, "Received error from peer: %s", desc);
 	peer_fatal_continue(take(msg), pps);
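
Before this change, softness was inferred from the error text itself; the
removed lines boil down to this heuristic (a standalone sketch of the old
behaviour, compilable on its own):

    #include <stdbool.h>
    #include <string.h>

    /* Old heuristic: only LND's known-transient "sync error" was soft. */
    static bool old_soft_error_heuristic(const char *desc)
    {
            return strstr(desc, "sync error") != NULL;
    }

With the flag supplied by the caller, any error received on an established
channel is treated as transient, not just that one known string.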

common/peer_failed.h

@@ -22,7 +22,9 @@ void peer_failed(struct per_peer_state *pps,
  * channel_id means all channels. */
 void peer_failed_received_errmsg(struct per_peer_state *pps,
 				 const char *desc,
-				 const struct channel_id *channel_id) NORETURN;
+				 const struct channel_id *channel_id,
+				 bool soft_error)
+	NORETURN;
 
 /* I/O error */
 void peer_failed_connection_lost(void) NORETURN;

common/read_peer_msg.c

@@ -148,6 +148,7 @@ bool handle_timestamp_filter(struct per_peer_state *pps, const u8 *msg TAKES)
 
 bool handle_peer_gossip_or_error(struct per_peer_state *pps,
 				 const struct channel_id *channel_id,
+				 bool soft_error,
 				 const u8 *msg TAKES)
 {
 	char *err;
@@ -176,7 +177,8 @@ bool handle_peer_gossip_or_error(struct per_peer_state *pps,
 		if (err)
 			peer_failed_received_errmsg(pps, err,
 						    all_channels
-						    ? NULL : channel_id);
+						    ? NULL : channel_id,
+						    soft_error);
 
 		/* Ignore unknown channel errors. */
 		goto handled;

common/read_peer_msg.h

@@ -58,6 +58,7 @@ bool is_wrong_channel(const u8 *msg, const struct channel_id *expected,
  * handle_peer_gossip_or_error - simple handler for all the above cases.
  * @pps: per-peer state.
  * @channel_id: the channel id of the current channel.
+ * @soft_error: tell lightningd that incoming error is non-fatal.
  * @msg: the peer message (only taken if returns true).
  *
  * This returns true if it handled the packet: a gossip packet (forwarded
@@ -66,6 +67,7 @@ bool is_wrong_channel(const u8 *msg, const struct channel_id *expected,
  */
 bool handle_peer_gossip_or_error(struct per_peer_state *pps,
 				 const struct channel_id *channel_id,
+				 bool soft_error,
 				 const u8 *msg TAKES);
 
 /**
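
The usual calling pattern is a read loop that lets the handler consume gossip
and error messages, as in closingd's closing_read_peer_msg() above; a sketch,
assuming ctx, pps and channel_id are in scope:

    for (;;) {
            u8 *msg = sync_crypto_read(ctx, pps);
            /* Consumed (gossip, error, wrong channel)? Read the next one. */
            if (!handle_peer_gossip_or_error(pps, channel_id, false, msg))
                    return msg;
    }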

openingd/openingd.c

@@ -432,7 +432,7 @@ static u8 *opening_negotiate_msg(const tal_t *ctx, struct state *state,
 				wire_sync_write(REQ_FD, take(msg));
 			}
 			peer_failed_received_errmsg(state->pps, err,
-						    NULL);
+						    NULL, false);
 		}
 		negotiation_aborted(state, am_funder,
 				    tal_fmt(tmpctx, "They sent error %s",
@@ -1283,7 +1283,7 @@ static u8 *handle_peer_in(struct state *state)
 
 	/* Handles standard cases, and legal unknown ones. */
 	if (handle_peer_gossip_or_error(state->pps,
-					&state->channel_id, msg))
+					&state->channel_id, false, msg))
 		return NULL;
 
 	sync_crypto_write(state->pps,

tests/test_closing.py

@@ -1608,7 +1608,7 @@ def test_shutdown(node_factory):
 
 @flaky
 @unittest.skipIf(not DEVELOPER, "needs to set upfront_shutdown_script")
-def test_option_upfront_shutdown_script(node_factory, bitcoind):
+def test_option_upfront_shutdown_script(node_factory, bitcoind, executor):
     l1 = node_factory.get_node(start=False)
     # Insist on upfront script we're not going to match.
     l1.daemon.env["DEV_OPENINGD_UPFRONT_SHUTDOWN_SCRIPT"] = "76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac"
@@ -1618,14 +1618,16 @@ def test_option_upfront_shutdown_script(node_factory, bitcoind):
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
     l1.fund_channel(l2, 1000000, False)
-    l1.rpc.close(l2.info['id'])
+    # This will block, as l2 will send an error but l1 will retry.
+    fut = executor.submit(l1.rpc.close, l2.info['id'])
 
     # l2 will close unilaterally when it dislikes shutdown script.
-    l1.daemon.wait_for_log(r'received ERROR.*scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
+    l1.daemon.wait_for_log(r'scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
 
     # Clear channel.
     wait_for(lambda: len(bitcoind.rpc.getrawmempool()) != 0)
     bitcoind.generate_block(1)
+    fut.result(TIMEOUT)
 
     wait_for(lambda: [c['state'] for c in only_one(l1.rpc.listpeers()['peers'])['channels']] == ['ONCHAIN'])
     wait_for(lambda: [c['state'] for c in only_one(l2.rpc.listpeers()['peers'])['channels']] == ['ONCHAIN'])
@@ -1636,7 +1638,7 @@ def test_option_upfront_shutdown_script(node_factory, bitcoind):
     l2.rpc.close(l1.info['id'])
 
     # l2 will close unilaterally when it dislikes shutdown script.
-    l1.daemon.wait_for_log(r'received ERROR.*scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
+    l1.daemon.wait_for_log(r'scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
 
     # Clear channel.
     wait_for(lambda: len(bitcoind.rpc.getrawmempool()) != 0)

tests/test_connection.py

@@ -943,6 +943,7 @@ def test_funding_by_utxos(node_factory, bitcoind):
     l1.rpc.fundchannel(l3.info["id"], int(0.01 * 10**8), utxos=utxos)
 
 
+@unittest.skipIf(not DEVELOPER, "needs dev_forget_channel")
 def test_funding_external_wallet_corners(node_factory, bitcoind):
     l1 = node_factory.get_node()
     l2 = node_factory.get_node()
@@ -1000,6 +1001,9 @@ def test_funding_external_wallet_corners(node_factory, bitcoind):
     # Canceld channel after fundchannel_complete
     assert l1.rpc.fundchannel_cancel(l2.info['id'])['cancelled']
 
+    # l2 won't give up, since it considers error "soft".
+    l2.rpc.dev_forget_channel(l1.info['id'])
+
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
     l1.rpc.fundchannel_start(l2.info['id'], amount)['funding_address']
     assert l1.rpc.fundchannel_complete(l2.info['id'], prep['txid'], txout)['commitments_secured']
@@ -1431,7 +1435,7 @@ def test_update_fee(node_factory, bitcoind):
 
 @unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
-def test_fee_limits(node_factory):
+def test_fee_limits(node_factory, bitcoind):
     l1, l2 = node_factory.line_graph(2, opts={'dev-max-fee-multiplier': 5, 'may_reconnect': True}, fundchannel=True)
 
     # L1 asks for stupid low fee (will actually hit the floor of 253)
@@ -1439,11 +1443,13 @@ def test_fee_limits(node_factory):
     l1.set_feerates((15, 15, 15), False)
     l1.start()
 
-    l1.daemon.wait_for_log('Peer permanent failure in CHANNELD_NORMAL: channeld: received ERROR channel .*: update_fee 253 outside range 1875-75000')
+    l1.daemon.wait_for_log('Peer transient failure in CHANNELD_NORMAL: channeld: .*: update_fee 253 outside range 1875-75000')
     # Make sure the resolution of this one doesn't interfere with the next!
     # Note: may succeed, may fail with insufficient fee, depending on how
     # bitcoind feels!
-    l1.daemon.wait_for_log('sendrawtx exit')
+    l2.daemon.wait_for_log('sendrawtx exit')
+    bitcoind.generate_block(1)
+    sync_blockheight(bitcoind, [l1, l2])
 
     # Trying to open a channel with too low a fee-rate is denied
     l4 = node_factory.get_node()

tests/test_pay.py

@@ -248,16 +248,20 @@ def test_pay_disconnect(node_factory, bitcoind):
     l1.set_feerates((10**6, 1000**6, 1000**6), False)
 
     # Wait for l1 notice
-    l1.daemon.wait_for_log(r'Peer permanent failure in CHANNELD_NORMAL: channeld: received ERROR channel .*: update_fee \d+ outside range 1875-75000')
+    l1.daemon.wait_for_log(r'Peer transient failure in CHANNELD_NORMAL: channeld: .*: update_fee \d+ outside range 1875-75000')
+
+    # l2 fails hard.
+    l2.daemon.wait_for_log('sendrawtx exit')
+    bitcoind.generate_block(1, wait_for_mempool=1)
+    sync_blockheight(bitcoind, [l1, l2])
 
     # Should fail due to permenant channel fail
-    with pytest.raises(RpcError, match=r'failed: WIRE_UNKNOWN_NEXT_PEER \(First peer not ready\)'):
+    with pytest.raises(RpcError, match=r'WIRE_UNKNOWN_NEXT_PEER'):
         l1.rpc.sendpay(route, rhash)
 
     assert not l1.daemon.is_in_log('Payment is still in progress')
 
     # After it sees block, someone should close channel.
     bitcoind.generate_block(1)
     l1.daemon.wait_for_log('ONCHAIN')
