Commit f6bf33a

Merge pull request #1191 from HathorNetwork/refactor/p2p-storage-slots
refactor(p2p): move received peers storage to protocol
2 parents 52a8fcb + 2a12952, commit f6bf33a

21 files changed, +321 -102 lines

hathor/conf/settings.py (9 additions, 0 deletions)

@@ -288,6 +288,9 @@ def GENESIS_TX2_TIMESTAMP(self) -> int:
     # Maximum period without receiving any messages from ther peer (in seconds).
     PEER_IDLE_TIMEOUT: int = 60
 
+    # Maximum number of entrypoints that we accept that a peer broadcasts
+    PEER_MAX_ENTRYPOINTS: int = 30
+
     # Filepath of ca certificate file to generate connection certificates
     CA_FILEPATH: str = os.path.join(os.path.dirname(__file__), '../p2p/ca.crt')
 

@@ -431,6 +434,12 @@ def GENESIS_TX2_TIMESTAMP(self) -> int:
     # more than enough for the forseeable future
     MAX_MEMPOOL_RECEIVING_TIPS: int = 1000
 
+    # Max number of peers simultanously stored in the node
+    MAX_VERIFIED_PEERS: int = 10_000
+
+    # Max number of peers simultanously stored per-connection
+    MAX_UNVERIFIED_PEERS_PER_CONN: int = 100
+
     # Used to enable nano contracts.
     #
     # This should NEVER be enabled for mainnet and testnet, since both networks will
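
Note: the two new storage limits above are consumed by the peer storage classes rather than enforced inline; manager.py (below) constructs VerifiedPeerStorage(rng=..., max_size=settings.MAX_VERIFIED_PEERS). A minimal sketch of what a size-bounded store with random eviction could look like; the names BoundedPeerStorage and Peer are hypothetical, and the real implementation lives in hathor/p2p/peer_storage.py, which is not part of this excerpt.

# Hypothetical sketch only: a dict-backed peer store that evicts a random entry
# once max_size is reached, mirroring the rng/max_size constructor arguments
# used by manager.py in this commit. Not the actual hathor implementation.
import random
from dataclasses import dataclass


@dataclass
class Peer:
    id: str


class BoundedPeerStorage(dict):
    def __init__(self, *, rng: random.Random, max_size: int) -> None:
        super().__init__()
        self.rng = rng
        self.max_size = max_size

    def add_or_replace(self, peer: Peer) -> None:
        # When full and the peer is new, drop a random existing entry first.
        if peer.id not in self and len(self) >= self.max_size:
            del self[self.rng.choice(list(self.keys()))]
        self[peer.id] = peer


storage = BoundedPeerStorage(rng=random.Random(42), max_size=3)
for name in ('a', 'b', 'c', 'd'):
    storage.add_or_replace(Peer(id=name))
assert len(storage) == 3  # one random peer was evicted to respect max_size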

hathor/p2p/manager.py (103 additions, 37 deletions)

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import deque
 from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Optional
 
 from structlog import get_logger
@@ -30,7 +31,7 @@
 from hathor.p2p.peer_discovery import PeerDiscovery
 from hathor.p2p.peer_endpoint import PeerAddress, PeerEndpoint
 from hathor.p2p.peer_id import PeerId
-from hathor.p2p.peer_storage import UnverifiedPeerStorage, VerifiedPeerStorage
+from hathor.p2p.peer_storage import VerifiedPeerStorage
 from hathor.p2p.protocol import HathorProtocol
 from hathor.p2p.rate_limiter import RateLimiter
 from hathor.p2p.states.ready import ReadyState
@@ -81,10 +82,10 @@ class GlobalRateLimiter:
     manager: Optional['HathorManager']
     connections: set[HathorProtocol]
     connected_peers: dict[PeerId, HathorProtocol]
+    new_connection_from_queue: deque[PeerId]
     connecting_peers: dict[IStreamClientEndpoint, _ConnectingPeer]
     handshaking_peers: set[HathorProtocol]
     whitelist_only: bool
-    unverified_peer_storage: UnverifiedPeerStorage
     verified_peer_storage: VerifiedPeerStorage
     _sync_factories: dict[SyncVersion, SyncAgentFactory]
     _enabled_sync_versions: set[SyncVersion]
@@ -156,12 +157,12 @@ def __init__(
         # List of peers connected and ready to communicate.
         self.connected_peers = {}
 
-        # List of peers received from the network.
-        # We cannot trust their identity before we connect to them.
-        self.unverified_peer_storage = UnverifiedPeerStorage()
+        # Queue of ready peer-id's used by connect_to_peer_from_connection_queue to choose the next peer to pull a
+        # random new connection from
+        self.new_connection_from_queue = deque()
 
         # List of known peers.
-        self.verified_peer_storage = VerifiedPeerStorage()  # dict[string (peer.id), PublicPeer]
+        self.verified_peer_storage = VerifiedPeerStorage(rng=self.rng, max_size=self._settings.MAX_VERIFIED_PEERS)
 
         # Maximum unseen time before removing a peer (seconds).
         self.max_peer_unseen_dt: float = 30 * 60  # 30-minutes
@@ -181,6 +182,11 @@ def __init__(
         # Timestamp of the last time sync was updated.
         self._last_sync_rotate: float = 0.
 
+        # Connect to new peers in a timed loop, instead of as soon as possible
+        self.lc_connect = LoopingCall(self.connect_to_peer_from_connection_queue)
+        self.lc_connect.clock = self.reactor
+        self.lc_connect_interval = 0.2  # seconds
+
         # A timer to try to reconnect to the disconnect known peers.
         if self._settings.ENABLE_PEER_WHITELIST:
             self.wl_reconnect = LoopingCall(self.update_whitelist)
@@ -272,7 +278,7 @@ def do_discovery(self) -> None:
         Do a discovery and connect on all discovery strategies.
         """
         for peer_discovery in self.peer_discoveries:
-            coro = peer_discovery.discover_and_connect(self.connect_to)
+            coro = peer_discovery.discover_and_connect(self.connect_to_endpoint)
             Deferred.fromCoroutine(coro)
 
     def disable_rate_limiter(self) -> None:
@@ -293,6 +299,7 @@ def start(self) -> None:
         if self.manager is None:
             raise TypeError('Class was built incorrectly without a HathorManager.')
 
+        self._start_peer_connect_loop()
         self.lc_reconnect.start(5, now=False)
         self.lc_sync_update.start(self.lc_sync_update_interval, now=False)
 
@@ -319,7 +326,28 @@ def _handle_whitelist_reconnect_err(self, *args: Any, **kwargs: Any) -> None:
         self.log.error('whitelist reconnect had an exception. Start looping call again.', args=args, kwargs=kwargs)
         self.reactor.callLater(30, self._start_whitelist_reconnect)
 
+    def _start_peer_connect_loop(self) -> None:
+        # The deferred returned by the LoopingCall start method
+        # executes when the looping call stops running
+        # https://docs.twistedmatrix.com/en/stable/api/twisted.internet.task.LoopingCall.html
+        d = self.lc_connect.start(self.lc_connect_interval, now=True)
+        d.addErrback(self._handle_peer_connect_err)
+
+    def _handle_peer_connect_err(self, *args: Any, **kwargs: Any) -> None:
+        # This method will be called when an exception happens inside the peer connect loop
+        # and ends up stopping the looping call.
+        # We log the error and start the looping call again.
+        self.log.error(
+            'connect_to_peer_from_connection_queue had an exception. Start looping call again.',
+            args=args,
+            kwargs=kwargs,
+        )
+        self.reactor.callLater(self.lc_connect_interval, self._start_peer_connect_loop)
+
     def stop(self) -> None:
+        if self.lc_connect.running:
+            self.lc_connect.stop()
+
         if self.lc_reconnect.running:
             self.lc_reconnect.stop()
 
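Note: the new lc_connect loop reuses the same restart-on-error pattern as the whitelist loop above it: LoopingCall.start() returns a deferred that only fires once the loop stops, so an errback can log the failure and schedule a fresh start. A standalone sketch of that pattern under the same assumption (only Twisted is needed); the names tick, start_loop and on_loop_error are illustrative, not from this commit.

# Sketch of the restart-on-error pattern used for lc_connect above.
from twisted.internet import task, reactor

counter = {'calls': 0}


def tick() -> None:
    counter['calls'] += 1
    if counter['calls'] == 3:
        # An uncaught exception stops the LoopingCall and fires its deferred's errback.
        raise RuntimeError('simulated failure inside the loop')


loop = task.LoopingCall(tick)


def start_loop() -> None:
    d = loop.start(0.2, now=True)
    d.addErrback(on_loop_error)


def on_loop_error(failure) -> None:
    # Log and schedule a restart, like _handle_peer_connect_err does above.
    print('loop stopped with:', failure.getErrorMessage())
    reactor.callLater(0.2, start_loop)


# start_loop()  # would be invoked from the service's start(), e.g. ConnectionsManager.start()
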
@@ -406,10 +434,10 @@ def on_peer_ready(self, protocol: HathorProtocol) -> None:
         """Called when a peer is ready."""
         assert protocol.peer is not None
         self.verified_peer_storage.add_or_replace(protocol.peer)
-        assert protocol.peer.id is not None
 
         self.handshaking_peers.remove(protocol)
-        self.unverified_peer_storage.pop(protocol.peer.id, None)
+        for conn in self.iter_all_connections():
+            conn.unverified_peer_storage.remove(protocol.peer)
 
         # we emit the event even if it's a duplicate peer as a matching
         # NETWORK_PEER_DISCONNECTED will be emitted regardless
@@ -419,7 +447,8 @@ def on_peer_ready(self, protocol: HathorProtocol) -> None:
             peers_count=self._get_peers_count()
         )
 
-        if protocol.peer.id in self.connected_peers:
+        peer_id = protocol.peer.id
+        if peer_id in self.connected_peers:
             # connected twice to same peer
             self.log.warn('duplicate connection to peer', protocol=protocol)
             conn = self.get_connection_to_drop(protocol)
@@ -428,15 +457,19 @@ def on_peer_ready(self, protocol: HathorProtocol) -> None:
             # the new connection is being dropped, so don't save it to connected_peers
             return
 
-        self.connected_peers[protocol.peer.id] = protocol
+        self.connected_peers[peer_id] = protocol
+        if peer_id not in self.new_connection_from_queue:
+            self.new_connection_from_queue.append(peer_id)
+        else:
+            self.log.warn('peer already in queue', peer=str(peer_id))
 
         # In case it was a retry, we must reset the data only here, after it gets ready
         protocol.peer.info.reset_retry_timestamp()
 
         if len(self.connected_peers) <= self.MAX_ENABLED_SYNC:
             protocol.enable_sync()
 
-        if protocol.peer.id in self.always_enable_sync:
+        if peer_id in self.always_enable_sync:
             protocol.enable_sync()
 
         # Notify other peers about this new peer connection.
@@ -456,7 +489,8 @@ def on_peer_disconnect(self, protocol: HathorProtocol) -> None:
         if protocol in self.handshaking_peers:
             self.handshaking_peers.remove(protocol)
         if protocol._peer is not None:
-            existing_protocol = self.connected_peers.pop(protocol.peer.id, None)
+            peer_id = protocol.peer.id
+            existing_protocol = self.connected_peers.pop(peer_id, None)
             if existing_protocol is None:
                 # in this case, the connection was closed before it got to READY state
                 return
@@ -466,7 +500,10 @@ def on_peer_disconnect(self, protocol: HathorProtocol) -> None:
                 # A check for duplicate connections is done during PEER_ID state, but there's still a
                 # chance it can happen if both connections start at the same time and none of them has
                 # reached READY state while the other is on PEER_ID state
-                self.connected_peers[protocol.peer.id] = existing_protocol
+                self.connected_peers[peer_id] = existing_protocol
+            elif peer_id in self.new_connection_from_queue:
+                # now we're sure it can be removed from new_connection_from_queue
+                self.new_connection_from_queue.remove(peer_id)
         self.pubsub.publish(
             HathorEvents.NETWORK_PEER_DISCONNECTED,
             protocol=protocol,
@@ -499,15 +536,6 @@ def is_peer_connected(self, peer_id: PeerId) -> bool:
         """
         return peer_id in self.connected_peers
 
-    def on_receive_peer(self, peer: UnverifiedPeer, origin: Optional[ReadyState] = None) -> None:
-        """ Update a peer information in our storage, and instantly attempt to connect
-        to it if it is not connected yet.
-        """
-        if peer.id == self.my_peer.id:
-            return
-        peer = self.unverified_peer_storage.add_or_merge(peer)
-        self.connect_to_if_not_connected(peer, int(self.reactor.seconds()))
-
     def peers_cleanup(self) -> None:
         """Clean up aged peers."""
         now = self.reactor.seconds()
@@ -523,11 +551,45 @@ def peers_cleanup(self) -> None:
         for remove_peer in to_be_removed:
             self.verified_peer_storage.remove(remove_peer)
 
-    def reconnect_to_all(self) -> None:
-        """ It is called by the `lc_reconnect` timer and tries to connect to all known
-        peers.
+    def connect_to_peer_from_connection_queue(self) -> None:
+        """ It is called by the `lc_connect` looping call and tries to connect to a new peer.
+        """
+        if not self.new_connection_from_queue:
+            self.log.debug('connection queue is empty')
+            return
+        assert self.manager is not None
+        self.log.debug('connect to peer from connection queue')
+        candidate_new_peers: list[UnverifiedPeer]
+        # we don't know if we will find a candidate, so we can't do `while True:`
+        for _ in range(len(self.new_connection_from_queue)):
+            # for a deque([1, 2, 3, 4]) this will get 1 and modify it to deque([2, 3, 4, 1])
+            next_from_peer_id = self.new_connection_from_queue[0]
+            self.new_connection_from_queue.rotate(-1)
+
+            protocol = self.connected_peers.get(next_from_peer_id)
+            if protocol is None:
+                self.log.error('expected protocol not found', peer_id=str(next_from_peer_id))
+                assert self.new_connection_from_queue.pop() == next_from_peer_id
+                continue
+            candidate_new_peers = [
+                candidate_peer
+                for candidate_peer_id, candidate_peer in protocol.unverified_peer_storage.items()
+                if candidate_peer_id not in self.connected_peers or candidate_peer_id not in self.connecting_peers
+            ]
+            if candidate_new_peers:
+                break
+        else:
+            self.log.debug('no new peers in the connection queue')
+            # this means we rotated through the whole queue and did not find any candidate
+            return
 
-        TODO(epnichols): Should we always connect to *all*?  Should there be a max #?
+        peer = self.rng.choice(candidate_new_peers)
+        self.log.debug('random peer chosen', peer=str(peer.id), entrypoints=peer.info.entrypoints_as_str())
+        now = self.reactor.seconds()
+        self.connect_to_peer(peer, int(now))
+
+    def reconnect_to_all(self) -> None:
+        """ It is called by the `lc_reconnect` timer and tries to connect to all known peers.
         """
         self.peers_cleanup()
         # when we have no connected peers left, run the discovery process again
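
Note: the selection loop above leans on two Python idioms: rotating a deque to walk the ready peers round-robin, and a for/else so that exhausting the queue without finding a candidate falls through to the else branch. A tiny illustration with plain integers in place of peer ids (the values are arbitrary):

# Illustration of the deque rotation and for/else control flow used by
# connect_to_peer_from_connection_queue, with integers instead of peer ids.
from collections import deque

queue = deque([1, 2, 3, 4])
for _ in range(len(queue)):
    head = queue[0]        # peek the current head, e.g. 1
    queue.rotate(-1)       # deque([1, 2, 3, 4]) becomes deque([2, 3, 4, 1])
    if head == 3:          # stand-in for "this peer has connection candidates"
        break
else:
    # only reached when the loop finishes without break, i.e. no candidate found
    print('no candidate in the whole queue')

print(head)   # 3
print(queue)  # deque([4, 1, 2, 3])
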
@@ -536,10 +598,10 @@ def reconnect_to_all(self) -> None:
         if now - self._last_discovery >= self.PEER_DISCOVERY_INTERVAL:
             self._last_discovery = now
             self.do_discovery()
-        # We need to use list() here because the dict might change inside connect_to_if_not_connected
+        # We need to use list() here because the dict might change inside connect_to_peer
         # when the peer is disconnected and without entrypoint
         for peer in list(self.verified_peer_storage.values()):
-            self.connect_to_if_not_connected(peer, int(now))
+            self.connect_to_peer(peer, int(now))
 
     def update_whitelist(self) -> Deferred[None]:
         from twisted.web.client import readBody
@@ -582,7 +644,7 @@ def _update_whitelist_cb(self, body: bytes) -> None:
         for peer_id in peers_to_remove:
             self.manager.remove_peer_from_whitelist_and_disconnect(peer_id)
 
-    def connect_to_if_not_connected(self, peer: UnverifiedPeer | PublicPeer, now: int) -> None:
+    def connect_to_peer(self, peer: UnverifiedPeer | PublicPeer, now: int) -> None:
         """ Attempts to connect if it is not connected to the peer.
         """
         if not peer.info.entrypoints or (
@@ -602,15 +664,16 @@ def connect_to_if_not_connected(self, peer: UnverifiedPeer | PublicPeer, now: in
         assert peer.id is not None
         if peer.info.can_retry(now):
             if self.enable_ipv6 and not self.disable_ipv4:
-                addr = self.rng.choice(peer.info.entrypoints)
+                addr = self.rng.choice(list(peer.info.entrypoints))
             elif self.enable_ipv6 and self.disable_ipv4:
                 addr = self.rng.choice(peer.info.get_ipv6_only_entrypoints())
             elif not self.enable_ipv6 and not self.disable_ipv4:
                 addr = self.rng.choice(peer.info.get_ipv4_only_entrypoints())
             else:
                 raise ValueError('IPv4 is disabled and IPv6 is not enabled')
-
-            self.connect_to(addr.with_id(peer.id), peer)
+            self.connect_to_endpoint(addr.with_id(peer.id), peer)
+        else:
+            self.log.debug('connecting too often, skip retrying', peer=str(peer.id))
 
     def _connect_to_callback(
         self,
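
Note: the list(...) wrap on peer.info.entrypoints suggests the entrypoints container became a set in this PR (consistent with the .append to .add change near the end of this file), and random.Random.choice only accepts an indexable sequence. A quick illustration, with made-up addresses in place of PeerAddress objects:

# random.choice needs a sequence; sets must be converted first.
import random

rng = random.Random(1)
entrypoints = {'tcp://a:40403', 'tcp://b:40403'}

# rng.choice(entrypoints) would raise a TypeError because a set is not subscriptable
addr = rng.choice(list(entrypoints))
print(addr)
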
@@ -628,14 +691,17 @@ def _connect_to_callback(
         protocol.wrappedProtocol.on_outbound_connect(entrypoint, peer)
         self.connecting_peers.pop(endpoint)
 
-    def connect_to(
+    def connect_to_endpoint(
         self,
         entrypoint: PeerEndpoint,
         peer: UnverifiedPeer | PublicPeer | None = None,
         use_ssl: bool | None = None,
     ) -> None:
-        """ Attempt to connect to a peer, even if a connection already exists.
-        Usually you should call `connect_to_if_not_connected`.
+        """ Attempt to connect directly to an endpoint, prefer calling `connect_to_peer` when possible.
+
+        This method does not take into account the peer's id (since we might not even know it, or have verified it even
+        if we know). But this method will check if there's already a connection open to the given endpoint and skip it
+        if there is one.
 
         If `use_ssl` is True, then the connection will be wraped by a TLS.
         """
@@ -747,7 +813,7 @@ def update_hostname_entrypoints(self, *, old_hostname: str | None, new_hostname:
 
     def _add_hostname_entrypoint(self, hostname: str, address: IPv4Address | IPv6Address) -> None:
         hostname_entrypoint = PeerAddress.from_hostname_address(hostname, address)
-        self.my_peer.info.entrypoints.append(hostname_entrypoint)
+        self.my_peer.info.entrypoints.add(hostname_entrypoint)
 
     def get_connection_to_drop(self, protocol: HathorProtocol) -> HathorProtocol:
         """ When there are duplicate connections, determine which one should be dropped.
