
channeld: treat all incoming errors as "soft", so we retry.

We still close the channel if we *send* an error, but we seem to have hit
another case where LND sends an error that seems transient. This makes a
best-effort attempt to preserve our channel in that case.

Some tests have to be modified, since they don't terminate as they did
previously :(

Changelog-Changed: quirks: We'll now reconnect and retry if we get an error on an established channel. This works around lnd sending error messages that may be non-fatal.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Rusty Russell (committed by Christian Decker)
commit 1d0c433dc4
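
In short, a new soft_error flag is threaded from handle_peer_gossip_or_error()
down to peer_failed_received_errmsg(): channeld passes true, so an incoming
error is reported to lightningd as transient and the connection is retried,
while openingd and closingd pass false and keep failing hard. A minimal sketch
of the resulting call convention (signatures and names taken from the diff
below):

    /* Established channel (channeld): incoming errors are soft, so we retry. */
    handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true, msg);

    /* Still opening or closing (openingd/closingd): errors stay fatal. */
    handle_peer_gossip_or_error(pps, channel_id, false, msg);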
 channeld/channeld.c       |  7
 closingd/closingd.c       |  2
 common/peer_failed.c      | 16
 common/peer_failed.h      |  4
 common/read_peer_msg.c    |  4
 common/read_peer_msg.h    |  2
 openingd/openingd.c       |  4
 tests/test_closing.py     | 10
 tests/test_connection.py  | 12
 tests/test_pay.py         | 10
 10 files changed

channeld/channeld.c

@@ -1824,7 +1824,9 @@ static void peer_in(struct peer *peer, const u8 *msg)
 		return;
 	}
 
-	if (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, msg))
+	/* Since LND seems to send errors which aren't actually fatal events,
+	 * we treat errors here as soft. */
+	if (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true, msg))
 		return;
 
 	/* Must get funding_locked before almost anything. */
@@ -2322,7 +2324,8 @@ static void peer_reconnect(struct peer *peer,
 	do {
 		clean_tmpctx();
 		msg = sync_crypto_read(tmpctx, peer->pps);
-	} while (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, msg)
+	} while (handle_peer_gossip_or_error(peer->pps, &peer->channel_id, true,
+					     msg)
		 || capture_premature_msg(&premature_msgs, msg));
 
 	if (peer->channel->option_static_remotekey) {

closingd/closingd.c

@@ -103,7 +103,7 @@ static u8 *closing_read_peer_msg(const tal_t *ctx,
 			handle_gossip_msg(pps, take(msg));
 			continue;
 		}
-		if (!handle_peer_gossip_or_error(pps, channel_id, msg))
+		if (!handle_peer_gossip_or_error(pps, channel_id, false, msg))
 			return msg;
 	}
 }

common/peer_failed.c

@@ -52,25 +52,15 @@ void peer_failed(struct per_peer_state *pps,
 /* We're failing because peer sent us an error message */
 void peer_failed_received_errmsg(struct per_peer_state *pps,
 				 const char *desc,
-				 const struct channel_id *channel_id)
+				 const struct channel_id *channel_id,
+				 bool soft_error)
 {
 	static const struct channel_id all_channels;
 	u8 *msg;
-	bool sync_error;
-
-	/* <+roasbeef> rusty: sync error can just be a timing thing
-	 * <+roasbeef> andn doesn't always mean that we can't continue forwrd,
-	 * or y'all sent the wrong info
-	 */
-	/* So while LND is sitting in the corner eating paint, back off. */
-	sync_error = strstr(desc, "sync error");
-	if (sync_error)
-		status_unusual("Peer said '%s' so we'll come back later",
-			       desc);
 
 	if (!channel_id)
 		channel_id = &all_channels;
-	msg = towire_status_peer_error(NULL, channel_id, desc, sync_error, pps,
+	msg = towire_status_peer_error(NULL, channel_id, desc, soft_error, pps,
 				       NULL);
 	peer_billboard(true, "Received error from peer: %s", desc);
 	peer_fatal_continue(take(msg), pps);
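
Before this change, softness was inferred from the error text itself; the
removed lines boil down to this heuristic (a standalone sketch of the old
behaviour, compilable on its own):

    #include <stdbool.h>
    #include <string.h>

    /* Old heuristic: only LND's known-transient "sync error" was soft. */
    static bool old_soft_error_heuristic(const char *desc)
    {
            return strstr(desc, "sync error") != NULL;
    }

With the flag supplied by the caller, any error received on an established
channel is treated as transient, not just that one known string.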

common/peer_failed.h

@@ -22,7 +22,9 @@ void peer_failed(struct per_peer_state *pps,
  * channel_id means all channels. */
 void peer_failed_received_errmsg(struct per_peer_state *pps,
 				 const char *desc,
-				 const struct channel_id *channel_id) NORETURN;
+				 const struct channel_id *channel_id,
+				 bool soft_error)
+	NORETURN;
 
 /* I/O error */
 void peer_failed_connection_lost(void) NORETURN;

common/read_peer_msg.c

@@ -148,6 +148,7 @@ bool handle_timestamp_filter(struct per_peer_state *pps, const u8 *msg TAKES)
 
 bool handle_peer_gossip_or_error(struct per_peer_state *pps,
 				 const struct channel_id *channel_id,
+				 bool soft_error,
 				 const u8 *msg TAKES)
 {
 	char *err;
@@ -176,7 +177,8 @@ bool handle_peer_gossip_or_error(struct per_peer_state *pps,
 		if (err)
 			peer_failed_received_errmsg(pps, err,
 						    all_channels
-						    ? NULL : channel_id);
+						    ? NULL : channel_id,
+						    soft_error);
 
 		/* Ignore unknown channel errors. */
 		goto handled;

common/read_peer_msg.h

@@ -58,6 +58,7 @@ bool is_wrong_channel(const u8 *msg, const struct channel_id *expected,
  * handle_peer_gossip_or_error - simple handler for all the above cases.
  * @pps: per-peer state.
  * @channel_id: the channel id of the current channel.
+ * @soft_error: tell lightningd that incoming error is non-fatal.
  * @msg: the peer message (only taken if returns true).
  *
  * This returns true if it handled the packet: a gossip packet (forwarded
@@ -66,6 +67,7 @@ bool is_wrong_channel(const u8 *msg, const struct channel_id *expected,
  */
 bool handle_peer_gossip_or_error(struct per_peer_state *pps,
 				 const struct channel_id *channel_id,
+				 bool soft_error,
 				 const u8 *msg TAKES);
 
 /**
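
The usual calling pattern is a read loop that lets the handler consume gossip
and error messages, as in closingd's closing_read_peer_msg() above; a sketch,
assuming ctx, pps and channel_id are in scope:

    for (;;) {
            u8 *msg = sync_crypto_read(ctx, pps);
            /* Consumed (gossip, error, wrong channel)? Read the next one. */
            if (!handle_peer_gossip_or_error(pps, channel_id, false, msg))
                    return msg;
    }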

openingd/openingd.c

@@ -432,7 +432,7 @@ static u8 *opening_negotiate_msg(const tal_t *ctx, struct state *state,
 				wire_sync_write(REQ_FD, take(msg));
 			}
 			peer_failed_received_errmsg(state->pps, err,
-						    NULL);
+						    NULL, false);
 		}
 		negotiation_aborted(state, am_funder,
 				    tal_fmt(tmpctx, "They sent error %s",
@@ -1283,7 +1283,7 @@ static u8 *handle_peer_in(struct state *state)
 
 	/* Handles standard cases, and legal unknown ones. */
 	if (handle_peer_gossip_or_error(state->pps,
-					&state->channel_id, msg))
+					&state->channel_id, false, msg))
 		return NULL;
 
 	sync_crypto_write(state->pps,

tests/test_closing.py

@@ -1608,7 +1608,7 @@ def test_shutdown(node_factory):
 
 @flaky
 @unittest.skipIf(not DEVELOPER, "needs to set upfront_shutdown_script")
-def test_option_upfront_shutdown_script(node_factory, bitcoind):
+def test_option_upfront_shutdown_script(node_factory, bitcoind, executor):
     l1 = node_factory.get_node(start=False)
     # Insist on upfront script we're not going to match.
     l1.daemon.env["DEV_OPENINGD_UPFRONT_SHUTDOWN_SCRIPT"] = "76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac"
@@ -1618,14 +1618,16 @@ def test_option_upfront_shutdown_script(node_factory, bitcoind):
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
     l1.fund_channel(l2, 1000000, False)
-    l1.rpc.close(l2.info['id'])
+    # This will block, as l2 will send an error but l1 will retry.
+    fut = executor.submit(l1.rpc.close, l2.info['id'])
 
     # l2 will close unilaterally when it dislikes shutdown script.
-    l1.daemon.wait_for_log(r'received ERROR.*scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
+    l1.daemon.wait_for_log(r'scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
 
     # Clear channel.
     wait_for(lambda: len(bitcoind.rpc.getrawmempool()) != 0)
     bitcoind.generate_block(1)
+    fut.result(TIMEOUT)
 
     wait_for(lambda: [c['state'] for c in only_one(l1.rpc.listpeers()['peers'])['channels']] == ['ONCHAIN'])
     wait_for(lambda: [c['state'] for c in only_one(l2.rpc.listpeers()['peers'])['channels']] == ['ONCHAIN'])
@@ -1636,7 +1638,7 @@ def test_option_upfront_shutdown_script(node_factory, bitcoind):
     l2.rpc.close(l1.info['id'])
 
     # l2 will close unilaterally when it dislikes shutdown script.
-    l1.daemon.wait_for_log(r'received ERROR.*scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
+    l1.daemon.wait_for_log(r'scriptpubkey .* is not as agreed upfront \(76a91404b61f7dc1ea0dc99424464cc4064dc564d91e8988ac\)')
 
     # Clear channel.
     wait_for(lambda: len(bitcoind.rpc.getrawmempool()) != 0)

tests/test_connection.py

@@ -943,6 +943,7 @@ def test_funding_by_utxos(node_factory, bitcoind):
     l1.rpc.fundchannel(l3.info["id"], int(0.01 * 10**8), utxos=utxos)
 
 
+@unittest.skipIf(not DEVELOPER, "needs dev_forget_channel")
 def test_funding_external_wallet_corners(node_factory, bitcoind):
     l1 = node_factory.get_node()
     l2 = node_factory.get_node()
@@ -1000,6 +1001,9 @@ def test_funding_external_wallet_corners(node_factory, bitcoind):
     # Canceld channel after fundchannel_complete
     assert l1.rpc.fundchannel_cancel(l2.info['id'])['cancelled']
 
+    # l2 won't give up, since it considers error "soft".
+    l2.rpc.dev_forget_channel(l1.info['id'])
+
     l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
     l1.rpc.fundchannel_start(l2.info['id'], amount)['funding_address']
     assert l1.rpc.fundchannel_complete(l2.info['id'], prep['txid'], txout)['commitments_secured']
@@ -1431,7 +1435,7 @@ def test_update_fee(node_factory, bitcoind):
 
 @unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
-def test_fee_limits(node_factory):
+def test_fee_limits(node_factory, bitcoind):
     l1, l2 = node_factory.line_graph(2, opts={'dev-max-fee-multiplier': 5, 'may_reconnect': True}, fundchannel=True)
 
     # L1 asks for stupid low fee (will actually hit the floor of 253)
@@ -1439,11 +1443,13 @@ def test_fee_limits(node_factory):
     l1.set_feerates((15, 15, 15), False)
     l1.start()
 
-    l1.daemon.wait_for_log('Peer permanent failure in CHANNELD_NORMAL: channeld: received ERROR channel .*: update_fee 253 outside range 1875-75000')
+    l1.daemon.wait_for_log('Peer transient failure in CHANNELD_NORMAL: channeld: .*: update_fee 253 outside range 1875-75000')
     # Make sure the resolution of this one doesn't interfere with the next!
     # Note: may succeed, may fail with insufficient fee, depending on how
     # bitcoind feels!
-    l1.daemon.wait_for_log('sendrawtx exit')
+    l2.daemon.wait_for_log('sendrawtx exit')
+    bitcoind.generate_block(1)
+    sync_blockheight(bitcoind, [l1, l2])
 
     # Trying to open a channel with too low a fee-rate is denied
     l4 = node_factory.get_node()

tests/test_pay.py

@@ -248,16 +248,20 @@ def test_pay_disconnect(node_factory, bitcoind):
     l1.set_feerates((10**6, 1000**6, 1000**6), False)
 
     # Wait for l1 notice
-    l1.daemon.wait_for_log(r'Peer permanent failure in CHANNELD_NORMAL: channeld: received ERROR channel .*: update_fee \d+ outside range 1875-75000')
+    l1.daemon.wait_for_log(r'Peer transient failure in CHANNELD_NORMAL: channeld: .*: update_fee \d+ outside range 1875-75000')
+
+    # l2 fails hard.
+    l2.daemon.wait_for_log('sendrawtx exit')
+    bitcoind.generate_block(1, wait_for_mempool=1)
+    sync_blockheight(bitcoind, [l1, l2])
 
     # Should fail due to permenant channel fail
-    with pytest.raises(RpcError, match=r'failed: WIRE_UNKNOWN_NEXT_PEER \(First peer not ready\)'):
+    with pytest.raises(RpcError, match=r'WIRE_UNKNOWN_NEXT_PEER'):
         l1.rpc.sendpay(route, rhash)
 
     assert not l1.daemon.is_in_log('Payment is still in progress')
 
     # After it sees block, someone should close channel.
     bitcoind.generate_block(1)
     l1.daemon.wait_for_log('ONCHAIN')
