Merge pull request #181 from greatest-ape/work-2023-01-29-b

udp: replace PanicSentinel with loop over JoinHandles
greatest-ape authored Jan 29, 2024
2 parents 1807c4a + 60c5a9c commit a86eb68
Showing 10 changed files with 174 additions and 101 deletions.
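
The core of the change: instead of a PanicSentinel that sends SIGTERM to the whole process when a worker panics, every worker thread's JoinHandle is kept, and the main thread periodically checks whether any worker has finished and turns that into an error. A minimal self-contained sketch of the pattern (dummy worker, hypothetical names, illustrative poll interval — not the actual aquatic_udp code, which is in the lib.rs diff below):

```rust
use std::thread::{sleep, Builder, JoinHandle};
use std::time::Duration;

use anyhow::Context;

// Stand-in for a real worker (socket, swarm, statistics): it returns an
// error instead of panicking; merely returning at all is treated as fatal.
fn run_worker(index: usize) -> anyhow::Result<()> {
    Err(anyhow::anyhow!("worker {} gave up", index + 1))
}

fn supervise() -> anyhow::Result<()> {
    let mut join_handles: Vec<(String, JoinHandle<anyhow::Result<()>>)> = Vec::new();

    for i in 0..2 {
        let handle = Builder::new()
            .name(format!("worker-{:02}", i + 1))
            .spawn(move || run_worker(i))
            .context("spawn worker")?;

        join_handles.push((format!("worker {}", i + 1), handle));
    }

    // JoinHandle::is_finished (stable since Rust 1.61) lets the main thread
    // detect a finished worker without blocking on join().
    loop {
        if let Some(i) = join_handles.iter().position(|(_, h)| h.is_finished()) {
            let (name, handle) = join_handles.remove(i);

            return match handle.join() {
                Ok(Ok(())) => Err(anyhow::anyhow!("{} stopped", name)),
                Ok(Err(err)) => Err(err.context(format!("{} stopped", name))),
                Err(_) => Err(anyhow::anyhow!("{} panicked", name)),
            };
        }

        sleep(Duration::from_secs(5));
    }
}
```

The real implementation below does the same with a WorkerType enum identifying each worker kind in the error message, and it moves SIGUSR1 handling onto a dedicated "signals" thread so the main thread is free to run this loop.
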
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -35,6 +35,10 @@
* Speed up parsing and serialization of requests and responses by using
[zerocopy](https://crates.io/crates/zerocopy)

#### Fixed

* Quit whole application if any worker thread quits

### aquatic_http

#### Added
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

14 changes: 4 additions & 10 deletions TODO.md
@@ -2,7 +2,7 @@

## High priority

* general
* http and ws
* add task to generate prometheus exports on regular interval to clean up
data. this is important if peer_clients is activated

@@ -12,7 +12,9 @@
In that case, a shorter duration (e.g., 30 seconds) would be a good idea.

* general
* panic sentinel not working? at least seemingly not in http?
* Replace panic sentinel with checking threads like in udp implementation.
It seems to be broken


## Medium priority

@@ -25,14 +27,6 @@
* toml v0.7
* syn v2.0

* quit whole program if any thread panics
* But it would be nice not to panic in workers, but to return errors instead.
Once JoinHandle::is_finished is available in stable Rust (#90470), an
option would be to
* Save JoinHandles
* When preparing to quit because of PanicSentinel sending SIGTERM, loop
through them, extract error and log it

* Run cargo-deny in CI

* aquatic_ws
3 changes: 2 additions & 1 deletion crates/udp/Cargo.toml
@@ -20,7 +20,7 @@ name = "aquatic_udp"
[features]
default = ["prometheus"]
# Export prometheus metrics
prometheus = ["metrics", "metrics-util", "metrics-exporter-prometheus"]
prometheus = ["metrics", "metrics-util", "metrics-exporter-prometheus", "tokio"]
# Experimental io_uring support (Linux 6.0 or later required)
io-uring = ["dep:io-uring"]
# Experimental CPU pinning support
@@ -59,6 +59,7 @@ tinytemplate = "1"
metrics = { version = "0.22", optional = true }
metrics-util = { version = "0.16", optional = true }
metrics-exporter-prometheus = { version = "0.13", optional = true, default-features = false, features = ["http-listener"] }
tokio = { version = "1", optional = true, features = ["rt", "net", "time"] }

# io-uring feature
io-uring = { version = "0.6", optional = true }
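
The new optional tokio dependency (features rt, net and time) is there so the Prometheus exporter can be driven by its own single-threaded runtime inside an ordinary worker thread, as the lib.rs changes below show. A rough standalone sketch of that pattern — the listen address and the 120 s idle timeout / 5 s render interval are placeholders, not aquatic's configuration:

```rust
use std::time::Duration;

use anyhow::Context;
use metrics_exporter_prometheus::PrometheusBuilder;
use metrics_util::MetricKindMask;

fn run_prometheus_endpoint() -> anyhow::Result<()> {
    // A current-thread runtime is enough to drive the HTTP listener.
    let rt = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .context("build prometheus tokio runtime")?;

    rt.block_on(async {
        let addr: std::net::SocketAddr =
            "0.0.0.0:9000".parse().context("parse listen address")?;

        let (recorder, exporter) = PrometheusBuilder::new()
            // Forget metrics that have not been updated for a while
            .idle_timeout(MetricKindMask::ALL, Some(Duration::from_secs(120)))
            .with_http_listener(addr)
            .build()
            .context("build prometheus recorder and exporter")?;

        // Render periodically so that the idle timeout actually prunes
        // stale metrics even when nothing is scraping the endpoint.
        tokio::spawn(async move {
            let mut interval = tokio::time::interval(Duration::from_secs(5));

            loop {
                interval.tick().await;
                recorder.handle().render();
            }
        });

        exporter.await.context("run prometheus exporter")
    })
}
```
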
191 changes: 139 additions & 52 deletions crates/udp/src/lib.rs
@@ -3,19 +3,20 @@ pub mod config;
pub mod workers;

use std::collections::BTreeMap;
use std::thread::Builder;
use std::fmt::Display;
use std::thread::{sleep, Builder, JoinHandle};
use std::time::Duration;

use anyhow::Context;
use crossbeam_channel::{bounded, unbounded};
use signal_hook::consts::{SIGTERM, SIGUSR1};
use signal_hook::consts::SIGUSR1;
use signal_hook::iterator::Signals;

use aquatic_common::access_list::update_access_list;
#[cfg(feature = "cpu-pinning")]
use aquatic_common::cpu_pinning::{pin_current_if_configured_to, WorkerIndex};
use aquatic_common::privileges::PrivilegeDropper;
use aquatic_common::{PanicSentinelWatcher, ServerStartInstant};
use aquatic_common::ServerStartInstant;

use common::{
ConnectedRequestSender, ConnectedResponseSender, SocketWorkerIndex, State, SwarmWorkerIndex,
@@ -28,12 +29,12 @@ pub const APP_NAME: &str = "aquatic_udp: UDP BitTorrent tracker";
pub const APP_VERSION: &str = env!("CARGO_PKG_VERSION");

pub fn run(config: Config) -> ::anyhow::Result<()> {
let mut signals = Signals::new([SIGUSR1, SIGTERM])?;
let mut signals = Signals::new([SIGUSR1])?;

let state = State::new(config.swarm_workers);
let connection_validator = ConnectionValidator::new(&config)?;
let (sentinel_watcher, sentinel) = PanicSentinelWatcher::create_with_sentinel();
let priv_dropper = PrivilegeDropper::new(config.privileges.clone(), config.socket_workers);
let mut join_handles = Vec::new();

update_access_list(&config.access_list, &state.access_list)?;

@@ -62,14 +63,13 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
}

for i in 0..config.swarm_workers {
let sentinel = sentinel.clone();
let config = config.clone();
let state = state.clone();
let request_receiver = request_receivers.remove(&i).unwrap().clone();
let response_sender = ConnectedResponseSender::new(response_senders.clone());
let statistics_sender = statistics_sender.clone();

Builder::new()
let handle = Builder::new()
.name(format!("swarm-{:02}", i + 1))
.spawn(move || {
#[cfg(feature = "cpu-pinning")]
@@ -81,7 +81,6 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
);

let mut worker = SwarmWorker {
_sentinel: sentinel,
config,
state,
server_start_instant,
@@ -91,13 +90,14 @@
worker_index: SwarmWorkerIndex(i),
};

worker.run();
worker.run()
})
.with_context(|| "spawn swarm worker")?;

join_handles.push((WorkerType::Swarm(i), handle));
}

for i in 0..config.socket_workers {
let sentinel = sentinel.clone();
let state = state.clone();
let config = config.clone();
let connection_validator = connection_validator.clone();
@@ -106,7 +106,7 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
let response_receiver = response_receivers.remove(&i).unwrap();
let priv_dropper = priv_dropper.clone();

Builder::new()
let handle = Builder::new()
.name(format!("socket-{:02}", i + 1))
.spawn(move || {
#[cfg(feature = "cpu-pinning")]
@@ -118,46 +118,48 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
);

workers::socket::run_socket_worker(
sentinel,
state,
config,
connection_validator,
server_start_instant,
request_sender,
response_receiver,
priv_dropper,
);
)
})
.with_context(|| "spawn socket worker")?;

join_handles.push((WorkerType::Socket(i), handle));
}

if config.statistics.active() {
let sentinel = sentinel.clone();
let state = state.clone();
let config = config.clone();

#[cfg(feature = "prometheus")]
if config.statistics.run_prometheus_endpoint {
use metrics_exporter_prometheus::PrometheusBuilder;
use metrics_util::MetricKindMask;
let handle = Builder::new()
.name("statistics".into())
.spawn(move || {
#[cfg(feature = "cpu-pinning")]
pin_current_if_configured_to(
&config.cpu_pinning,
config.socket_workers,
config.swarm_workers,
WorkerIndex::Util,
);

PrometheusBuilder::new()
.idle_timeout(
MetricKindMask::ALL,
Some(Duration::from_secs(config.statistics.interval * 2)),
)
.with_http_listener(config.statistics.prometheus_endpoint_address)
.install()
.with_context(|| {
format!(
"Install prometheus endpoint on {}",
config.statistics.prometheus_endpoint_address
)
})?;
}
workers::statistics::run_statistics_worker(config, state, statistics_receiver)
})
.with_context(|| "spawn statistics worker")?;

Builder::new()
.name("statistics".into())
join_handles.push((WorkerType::Statistics, handle));
}

#[cfg(feature = "prometheus")]
if config.statistics.active() && config.statistics.run_prometheus_endpoint {
let config = config.clone();

let handle = Builder::new()
.name("prometheus".into())
.spawn(move || {
#[cfg(feature = "cpu-pinning")]
pin_current_if_configured_to(
@@ -167,14 +169,73 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
WorkerIndex::Util,
);

workers::statistics::run_statistics_worker(
sentinel,
config,
state,
statistics_receiver,
use metrics_exporter_prometheus::PrometheusBuilder;
use metrics_util::MetricKindMask;

let rt = ::tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.context("build prometheus tokio runtime")?;

rt.block_on(async {
let (recorder, exporter) = PrometheusBuilder::new()
.idle_timeout(
MetricKindMask::ALL,
Some(Duration::from_secs(config.statistics.interval * 2)),
)
.with_http_listener(config.statistics.prometheus_endpoint_address)
.build()
.context("build prometheus recorder and exporter")?;

::tokio::spawn(async move {
let mut interval = ::tokio::time::interval(Duration::from_secs(5));

loop {
interval.tick().await;

// Periodically render metrics to make sure
// idles are cleaned up
recorder.handle().render();
}
});

exporter.await.context("run prometheus exporter")
})
})
.with_context(|| "spawn prometheus exporter worker")?;

join_handles.push((WorkerType::Prometheus, handle));
}

// Spawn signal handler thread
{
let config = config.clone();

let handle: JoinHandle<anyhow::Result<()>> = Builder::new()
.name("signals".into())
.spawn(move || {
#[cfg(feature = "cpu-pinning")]
pin_current_if_configured_to(
&config.cpu_pinning,
config.socket_workers,
config.swarm_workers,
WorkerIndex::Util,
);

for signal in &mut signals {
match signal {
SIGUSR1 => {
let _ = update_access_list(&config.access_list, &state.access_list);
}
_ => unreachable!(),
}
}

Ok(())
})
.with_context(|| "spawn statistics worker")?;
.context("spawn signal worker")?;

join_handles.push((WorkerType::Signals, handle));
}

#[cfg(feature = "cpu-pinning")]
@@ -185,21 +246,47 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
WorkerIndex::Util,
);

for signal in &mut signals {
match signal {
SIGUSR1 => {
let _ = update_access_list(&config.access_list, &state.access_list);
}
SIGTERM => {
if sentinel_watcher.panic_was_triggered() {
return Err(anyhow::anyhow!("worker thread panicked"));
loop {
for (i, (_, handle)) in join_handles.iter().enumerate() {
if handle.is_finished() {
let (worker_type, handle) = join_handles.remove(i);

match handle.join() {
Ok(Ok(())) => {
return Err(anyhow::anyhow!("{} stopped", worker_type));
}
Ok(Err(err)) => {
return Err(err.context(format!("{} stopped", worker_type)));
}
Err(_) => {
return Err(anyhow::anyhow!("{} panicked", worker_type));
}
}

break;
}
_ => unreachable!(),
}

sleep(Duration::from_secs(5));
}
}

enum WorkerType {
Swarm(usize),
Socket(usize),
Statistics,
Signals,
#[cfg(feature = "prometheus")]
Prometheus,
}

Ok(())
impl Display for WorkerType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Swarm(index) => f.write_fmt(format_args!("Swarm worker {}", index + 1)),
Self::Socket(index) => f.write_fmt(format_args!("Socket worker {}", index + 1)),
Self::Statistics => f.write_str("Statistics worker"),
Self::Signals => f.write_str("Signals worker"),
#[cfg(feature = "prometheus")]
Self::Prometheus => f.write_str("Prometheus worker"),
}
}
}