WIP: стабилизация звонков и E2EE + инструменты сборки WebRTC

This commit is contained in:
2026-03-25 22:20:24 +05:00
parent 530047c5d0
commit eea650face
8 changed files with 1119 additions and 219 deletions

View File

@@ -45,6 +45,280 @@ static void diag_write(const char *fmt, ...) {
if (n > 0) write(g_diag_fd, buf, n);
}
/* ── RTP helpers (for cases when additional_data is empty) ───── */
// Fields pulled out of one RTP packet by parse_rtp_packet().
struct ParsedRtpPacket {
    // Header length in bytes: 12 fixed bytes + 4 per CSRC entry + the
    // header extension (4-byte extension header plus its body) when present.
    size_t header_size{0};
    // Sequence number, read big-endian from packet bytes 2..3.
    uint16_t sequence{0};
    // Timestamp, read big-endian from packet bytes 4..7.
    uint32_t timestamp{0};
    // SSRC, read big-endian from packet bytes 8..11.
    uint32_t ssrc{0};
};
// Per-stream state used by fill_nonce_from_rtp_frame() to decide whether the
// bytes it is given really are a continuous RTP stream.
struct RtpProbeState {
    // True once two consecutive packets agreed on SSRC and sequence progress.
    bool locked{false};
    // True while a candidate first packet is remembered in the probe_* fields.
    bool has_probe{false};
    // Candidate packet remembered while not locked.
    uint32_t probe_ssrc{0};
    uint16_t probe_sequence{0};
    // Written on every probe restart; not read by the code visible here.
    uint32_t probe_timestamp{0};
    // SSRC of the stream the state is locked onto.
    uint32_t ssrc{0};
    // Sequence number of the most recent accepted packet of the locked stream.
    uint16_t last_sequence{0};
    // Timestamp of that packet; not read by the code visible here.
    uint32_t last_timestamp{0};
};
// State for the locally generated timestamp that is used as nonce input when
// neither additional_data nor an in-frame RTP header provides one.
struct GeneratedTsState {
    // Set the first time the generated-timestamp path is taken.
    bool initialized{false};
    // Timestamp that the next frame on this path will use.
    uint32_t next_timestamp{0};
    // Most recent per-packet increment in samples.
    // Default 960 = 20 ms @ 48 kHz (default Opus packetization).
    uint32_t next_step{960};
};
// Baselines used by fill_nonce_from_additional_data() to rebase timestamps so
// that the first one seen maps to zero. The 64-bit and 32-bit layouts keep
// independent baselines.
struct AdditionalTsState {
    // True once base64 holds the first 8-byte timestamp seen.
    bool initialized64{false};
    // True once base32 holds the first RTP-header timestamp seen.
    bool initialized32{false};
    // Baseline subtracted from 8-byte timestamps.
    uint64_t base64{0};
    // Baseline subtracted from 32-bit RTP timestamps (modulo 2^32).
    uint32_t base32{0};
};
// Reads an unsigned 16-bit big-endian value from p[0..1].
// The caller must guarantee that two bytes are readable at p.
static inline uint16_t load16_be(const uint8_t* p) {
    const unsigned high = p[0];
    const unsigned low = p[1];
    return static_cast<uint16_t>((high << 8) | low);
}
// Reads an unsigned 32-bit big-endian value from p[0..3].
// The caller must guarantee that four bytes are readable at p.
static inline uint32_t load32_be(const uint8_t* p) {
    uint32_t value = 0;
    for (int i = 0; i < 4; ++i) {
        // Shift in one byte at a time, most significant byte first.
        value = (value << 8) | static_cast<uint32_t>(p[i]);
    }
    return value;
}
// Reads an unsigned 64-bit big-endian value from p[0..7].
// The caller must guarantee that eight bytes are readable at p.
static inline uint64_t load64_be(const uint8_t* p) {
    uint64_t value = 0;
    for (int i = 0; i < 8; ++i) {
        // Shift in one byte at a time, most significant byte first.
        value = (value << 8) | static_cast<uint64_t>(p[i]);
    }
    return value;
}
// Writes v to p[0..7] in big-endian byte order.
// The caller must guarantee that eight bytes are writable at p.
static inline void store64_be(uint8_t* p, uint64_t v) {
    // Fill from the last byte backwards, peeling off the low byte each time.
    for (int i = 7; i >= 0; --i) {
        p[i] = static_cast<uint8_t>(v & 0xFFu);
        v >>= 8;
    }
}
// Tries to interpret data[0..len) as an RTP packet.
//
// Returns true and fills *out only when all of the following hold:
//   - data and out are non-null and at least the 12-byte fixed header fits;
//   - the version field (top two bits of byte 0) is 2;
//   - the CSRC list and, if the X bit is set, the header extension fit in len;
//   - the bytes after the header number between 1 and 1200.
// On failure *out is left untouched. The padding bit is not examined.
static bool parse_rtp_packet(const uint8_t* data, size_t len, ParsedRtpPacket* out) {
    constexpr size_t kFixedHeaderBytes = 12;
    constexpr size_t kExtensionHeaderBytes = 4;
    constexpr size_t kMaxPayloadBytes = 1200;

    if (data == nullptr || out == nullptr || len < kFixedHeaderBytes) return false;

    const uint8_t first_byte = data[0];
    if ((first_byte >> 6) != 2) return false;

    // Fixed header followed by CC (low nibble of byte 0) 32-bit CSRC entries.
    size_t header_len = kFixedHeaderBytes + 4 * static_cast<size_t>(first_byte & 0x0F);
    if (header_len > len) return false;

    const bool extension_present = (first_byte & 0x10) != 0;
    if (extension_present) {
        // Extension header: 16-bit profile id, then 16-bit length counted in
        // 32-bit words (the 4-byte extension header itself is not included).
        if (len < header_len + kExtensionHeaderBytes) return false;
        const size_t extension_words = load16_be(data + header_len + 2);
        header_len += kExtensionHeaderBytes + extension_words * 4;
        if (header_len > len) return false;
    }

    const size_t payload_len = len - header_len;
    if (payload_len < 1 || payload_len > kMaxPayloadBytes) return false;

    out->header_size = header_len;
    out->sequence = load16_be(data + 2);
    out->timestamp = load32_be(data + 4);
    out->ssrc = load32_be(data + 8);
    return true;
}
static bool fill_nonce_from_rtp_frame(const uint8_t* data,
size_t len,
RtpProbeState* state,
uint8_t nonce[24],
size_t* header_size) {
if (!state) return false;
ParsedRtpPacket packet;
if (!parse_rtp_packet(data, len, &packet)) return false;
if (!state->locked) {
if (!state->has_probe) {
state->has_probe = true;
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
return false;
}
const bool same_ssrc = packet.ssrc == state->probe_ssrc;
const uint16_t seq_delta = (uint16_t)(packet.sequence - state->probe_sequence);
const bool sequence_progressed = seq_delta > 0 && seq_delta <= 10;
if (!same_ssrc || !sequence_progressed) {
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
return false;
}
state->locked = true;
state->has_probe = false;
state->ssrc = packet.ssrc;
state->last_sequence = packet.sequence;
state->last_timestamp = packet.timestamp;
} else {
if (packet.ssrc != state->ssrc) {
state->locked = false;
state->has_probe = true;
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
return false;
}
const uint16_t seq_delta = (uint16_t)(packet.sequence - state->last_sequence);
// Accept in-order packets and small jumps (packet loss).
if (seq_delta != 0 && seq_delta <= 200) {
state->last_sequence = packet.sequence;
state->last_timestamp = packet.timestamp;
} else if (seq_delta != 0) {
// Not plausible for a continuous stream: re-probe.
state->locked = false;
state->has_probe = true;
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
return false;
}
}
nonce[4] = (uint8_t)(packet.timestamp >> 24);
nonce[5] = (uint8_t)(packet.timestamp >> 16);
nonce[6] = (uint8_t)(packet.timestamp >> 8);
nonce[7] = (uint8_t)(packet.timestamp);
if (header_size) *header_size = packet.header_size;
return true;
}
// Derives nonce bytes from WebRTC's additional_data blob.
//
// Two layouts are recognised:
//   - RTP header (len >= 12 and top two bits of byte 0 equal 2): the 32-bit
//     timestamp at data[4..7] goes to nonce[4..7]; *used_rtp_header = true.
//   - Otherwise: data[0..7] is taken as a 64-bit big-endian timestamp and
//     written to nonce[0..7].
//
// When normalize_timestamps is true and ts_state is non-null, the first
// timestamp seen for a layout becomes its baseline and is subtracted from
// every timestamp (so the first one maps to 0); *used_normalized = true.
//
// Both out-flags (when non-null) are reset to false on entry. Returns false
// without touching nonce when data is null or shorter than 8 bytes.
static bool fill_nonce_from_additional_data(const uint8_t* data,
                                            size_t len,
                                            uint8_t nonce[24],
                                            AdditionalTsState* ts_state,
                                            bool normalize_timestamps,
                                            bool* used_normalized,
                                            bool* used_rtp_header) {
    if (used_normalized != nullptr) *used_normalized = false;
    if (used_rtp_header != nullptr) *used_rtp_header = false;
    if (data == nullptr || len < 8) return false;

    const bool rebase = normalize_timestamps && ts_state != nullptr;
    const bool looks_like_rtp_header = len >= 12 && (data[0] >> 6) == 2;

    if (!looks_like_rtp_header) {
        // Generic 8-byte timestamp layout (desktop's nonce[0..7] layout).
        uint64_t stamp = load64_be(data);
        if (rebase) {
            if (!ts_state->initialized64) {
                ts_state->base64 = stamp;
                ts_state->initialized64 = true;
            }
            stamp -= ts_state->base64;
            if (used_normalized != nullptr) *used_normalized = true;
        }
        store64_be(nonce, stamp);
        return true;
    }

    // Common native WebRTC layout: additional_data is RTP header bytes.
    uint32_t stamp = load32_be(data + 4);
    if (rebase) {
        if (!ts_state->initialized32) {
            ts_state->base32 = stamp;
            ts_state->initialized32 = true;
        }
        stamp -= ts_state->base32;  // unsigned arithmetic: wraps modulo 2^32
        if (used_normalized != nullptr) *used_normalized = true;
    }
    for (int i = 0; i < 4; ++i) {
        nonce[4 + i] = static_cast<uint8_t>(stamp >> (24 - 8 * i));
    }
    if (used_rtp_header != nullptr) *used_rtp_header = true;
    return true;
}
// Writes ts big-endian into nonce[4..7]; every other nonce byte is left as is.
static inline void fill_nonce_from_ts32(uint32_t ts, uint8_t nonce[24]) {
    for (int i = 0; i < 4; ++i) {
        nonce[4 + i] = static_cast<uint8_t>(ts >> (24 - 8 * i));
    }
}
// Maps the 5-bit Opus TOC "config" value to the duration of one frame,
// expressed in samples at 48 kHz (RFC 6716 TOC config mapping).
//   config  0..11  SILK-only : 10 / 20 / 40 / 60 ms
//   config 12..15  Hybrid    : 10 / 20 ms
//   config 16..    CELT-only : 2.5 / 5 / 10 / 20 ms
static inline uint32_t opus_base_frame_samples(uint8_t config) {
    static constexpr uint32_t kSilkSamples[4] = {480, 960, 1920, 2880};
    static constexpr uint32_t kCeltSamples[4] = {120, 240, 480, 960};

    const unsigned slot = config & 0x03u;
    if (config > 15) return kCeltSamples[slot];
    if (config > 11) return (config & 0x01u) != 0 ? 960u : 480u;
    return kSilkSamples[slot];
}
// Estimates how many 48 kHz samples an Opus packet covers, from its TOC byte
// (and, for frame code 3, the frame-count byte that follows).
//
// Falls back to 960 (20 ms) when the packet is null/empty, when a code-3
// packet lacks its count byte or declares 0 or more than 48 frames, or when
// the computed total lies outside 120..5760 samples (2.5 ms .. 120 ms).
static uint32_t infer_opus_packet_duration_samples(const uint8_t* packet, size_t len) {
    constexpr uint32_t kFallbackSamples = 960;
    if (packet == nullptr || len == 0) return kFallbackSamples;

    const uint8_t toc = packet[0];
    uint32_t frames = 0;
    switch (toc & 0x03) {
        case 0:  // one frame
            frames = 1;
            break;
        case 3:  // frame count is in the low 6 bits of the next byte
            if (len < 2) return kFallbackSamples;
            frames = static_cast<uint32_t>(packet[1] & 0x3F);
            if (frames < 1 || frames > 48) return kFallbackSamples;
            break;
        default:  // codes 1 and 2: two frames
            frames = 2;
            break;
    }

    const uint32_t samples =
        frames * opus_base_frame_samples(static_cast<uint8_t>(toc >> 3));
    return (samples >= 120 && samples <= 5760) ? samples : kFallbackSamples;
}
// Cheap sanity check that packet[0..len) could be an Opus packet; used to
// judge whether a decryption attempt produced sensible output.
//
// Returns false for a null/empty packet or one longer than 2000 bytes.
// For frame codes 0..2 nothing beyond the length is checked, so any such
// buffer is accepted. For frame code 3 the count byte must be present, must
// declare 1..48 frames, and the total duration must not exceed 5760 samples
// (120 ms at 48 kHz).
//
// The TOC config is the top 5 bits of an 8-bit value and therefore always in
// 0..31, so no range check on it is needed.
static bool is_plausible_opus_packet(const uint8_t* packet, size_t len) {
    if (packet == nullptr || len == 0 || len > 2000) return false;

    const uint8_t toc = packet[0];
    const uint8_t frame_code = static_cast<uint8_t>(toc & 0x03);
    if (frame_code != 3) return true;

    if (len < 2) return false;
    const uint32_t frame_count = static_cast<uint32_t>(packet[1] & 0x3F);
    if (frame_count == 0 || frame_count > 48) return false;

    const uint8_t config = static_cast<uint8_t>(toc >> 3);
    return opus_base_frame_samples(config) * frame_count <= 5760;
}
/* ── Native crash handler — writes to file before dying ──────── */
static char g_crash_path[512] = {0};
@@ -98,57 +372,114 @@ public:
}
/**
* Frame format: [4-byte counter BE] + [xchacha20_xor(frame)]
* Desktop-compatible frame format: ciphertext only (no custom prefix).
*
* Nonce (24 bytes): [0,0,0,0, counter_BE_4bytes, 0,...,0]
* This matches Desktop's layout where nonce[4..7] = timestamp.
* The counter is embedded so the receiver can reconstruct the nonce
* even if frames are dropped/reordered.
* Nonce (24 bytes) is derived exactly like desktop:
* - nonce[0..3] = 0
* - nonce[4..7] = RTP timestamp
* - nonce[8..23] = 0
*
* Primary source of timestamp: additional_data[4..7] (if provided by WebRTC).
* Fallback (Android path where additional_data can be empty):
* parse RTP header from frame and take timestamp from frame[4..7].
*
* If RTP header is found inside frame, we leave header bytes unencrypted
* and encrypt only payload (desktop-compatible).
*/
// Encrypt: XORs `frame` into `encrypted_frame` via rosetta_xchacha20_xor under
// key_, reports the number of bytes produced through *bytes_written, and
// returns 0 on success or -1 when the frame is empty / the output too small.
//
// NOTE(review): this span interleaves pre-change and post-change lines of the
// same method (two spellings of the additional_data parameter, two size
// checks, a HEADER-prefixed write next to a prefix-free write, two
// *bytes_written assignments, two sets of log statements). It does not compile
// as shown — confirm every comment below against the real file.
int Encrypt(cricket::MediaType /*media_type*/,
uint32_t /*ssrc*/,
rtc::ArrayView<const uint8_t> /*additional_data*/,
rtc::ArrayView<const uint8_t> additional_data,
rtc::ArrayView<const uint8_t> frame,
rtc::ArrayView<uint8_t> encrypted_frame,
size_t* bytes_written) override {
const size_t HEADER = 4; // counter prefix
if (frame.size() == 0 || encrypted_frame.size() < frame.size() + HEADER) {
if (frame.size() == 0 || encrypted_frame.size() < frame.size()) {
*bytes_written = 0;
return -1;
}
uint32_t ctr = counter_.fetch_add(1, std::memory_order_relaxed);
// header_size stays 0 unless the in-frame RTP fallback below succeeds.
size_t header_size = 0;
// Flags recording which source supplied the nonce; used for the diag "mode".
bool nonce_from_rtp_header = false;
bool nonce_from_generated_ts = false;
bool nonce_from_additional_data = false;
bool nonce_from_additional_normalized = false;
bool additional_was_rtp_header = false;
uint32_t generated_ts_used = 0;
// Write 4-byte counter as big-endian prefix
encrypted_frame.data()[0] = (uint8_t)(ctr >> 24);
encrypted_frame.data()[1] = (uint8_t)(ctr >> 16);
encrypted_frame.data()[2] = (uint8_t)(ctr >> 8);
encrypted_frame.data()[3] = (uint8_t)(ctr);
// Build nonce from counter (same positions as Desktop's timestamp)
// Build nonce from RTP timestamp in additional_data (preferred).
uint8_t nonce[24] = {0};
nonce[4] = encrypted_frame.data()[0];
nonce[5] = encrypted_frame.data()[1];
nonce[6] = encrypted_frame.data()[2];
nonce[7] = encrypted_frame.data()[3];
// Source 1: additional_data, with timestamps rebased against additional_ts_
// (normalize_timestamps is passed as true).
nonce_from_additional_data = fill_nonce_from_additional_data(
additional_data.data(),
additional_data.size(),
nonce,
&additional_ts_,
true,
&nonce_from_additional_normalized,
&additional_was_rtp_header);
if (!nonce_from_additional_data) {
// Source 2: an RTP header inside the frame itself (needs rtp_probe_ to lock).
nonce_from_rtp_header =
fill_nonce_from_rtp_frame(frame.data(), frame.size(), &rtp_probe_, nonce, &header_size);
if (!nonce_from_rtp_header) {
// Source 3: locally generated timestamp, starting at 0.
if (!generated_ts_.initialized) {
generated_ts_.initialized = true;
generated_ts_.next_timestamp = 0;
generated_ts_.next_step = 960;
}
nonce_from_generated_ts = true;
generated_ts_used = generated_ts_.next_timestamp;
fill_nonce_from_ts32(generated_ts_used, nonce);
}
}
rosetta_xchacha20_xor(encrypted_frame.data() + HEADER,
frame.data(), frame.size(), nonce, key_);
*bytes_written = frame.size() + HEADER;
if (nonce_from_rtp_header && header_size <= frame.size()) {
// Keep RTP header clear, encrypt payload only.
if (header_size > 0) {
memcpy(encrypted_frame.data(), frame.data(), header_size);
}
const size_t payload_size = frame.size() - header_size;
rosetta_xchacha20_xor(
encrypted_frame.data() + header_size,
frame.data() + header_size,
payload_size,
nonce,
key_);
} else {
// Legacy path: frame is payload-only.
rosetta_xchacha20_xor(encrypted_frame.data(),
frame.data(), frame.size(), nonce, key_);
}
*bytes_written = frame.size();
if (nonce_from_generated_ts) {
// Advance the generated timestamp by the duration inferred from the
// plaintext frame's Opus TOC byte.
const uint32_t step = infer_opus_packet_duration_samples(frame.data(), frame.size());
generated_ts_.next_step = step;
generated_ts_.next_timestamp = generated_ts_used + step;
}
// Diag: log first 3 frames
int n = diag_count_.fetch_add(1, std::memory_order_relaxed);
if (n < 3) {
LOGI("ENC frame#%d: sz=%zu ctr=%u out=%zu",
n, frame.size(), ctr, frame.size() + HEADER);
diag_write("ENC frame#%d: sz=%zu ctr=%u nonce[4..7]=%02x%02x%02x%02x\n",
n, frame.size(), ctr, nonce[4], nonce[5], nonce[6], nonce[7]);
// Short tag naming the nonce source: "rtp", "gen", or one of the
// additional_data variants.
const char* mode =
nonce_from_rtp_header
? "rtp"
: (nonce_from_generated_ts
? "gen"
: (nonce_from_additional_data
? (additional_was_rtp_header
? (nonce_from_additional_normalized ? "ad-rtp-norm" : "ad-rtp")
: (nonce_from_additional_normalized ? "raw-norm" : "raw-abs"))
: "raw-abs"));
LOGI("ENC frame#%d: sz=%zu ad=%zu hdr=%zu mode=%s nonce=%02x%02x%02x%02x",
n, frame.size(), additional_data.size(), header_size, mode,
nonce[4], nonce[5], nonce[6], nonce[7]);
diag_write("ENC frame#%d: sz=%zu ad=%zu hdr=%zu mode=%s nonce[4..7]=%02x%02x%02x%02x\n",
n, frame.size(), additional_data.size(), header_size, mode,
nonce[4], nonce[5], nonce[6], nonce[7]);
}
return 0;
}
// Upper bound on the ciphertext size for a plaintext of frame_size bytes.
// NOTE(review): both the pre-change return (with the 4-byte counter prefix)
// and the post-change return (same size as the plaintext) appear here;
// only the first would execute as written — confirm against the real file.
size_t GetMaxCiphertextByteSize(cricket::MediaType, size_t frame_size) override {
return frame_size + 4; // +4 for counter prefix
return frame_size;
}
protected:
@@ -156,8 +487,10 @@ protected:
private:
mutable std::atomic<int> ref_{0};
mutable std::atomic<uint32_t> counter_{0};
mutable std::atomic<int> diag_count_{0};
mutable RtpProbeState rtp_probe_;
mutable GeneratedTsState generated_ts_;
mutable AdditionalTsState additional_ts_;
uint8_t key_[32];
};
@@ -185,57 +518,180 @@ public:
}
/**
* Decrypt frame: read 4-byte counter prefix → derive nonce → decrypt.
* If frame has no prefix (< 5 bytes or from Desktop), fallback to
* nonce derived from additional_data (RTP header) or zeros.
* Desktop-compatible decrypt:
* - nonce from RTP timestamp
* - if RTP header is present inside encrypted_frame (fallback path),
* keep header bytes untouched and decrypt payload only.
*/
// Decrypt: derives a nonce for `encrypted_frame`, XORs it into `frame` via
// rosetta_xchacha20_xor under key_, and returns kOk with the byte count, or
// kFailedToDecrypt when the input is empty / the output buffer too small.
//
// NOTE(review): this span interleaves pre-change and post-change lines of the
// same method (the old 4-byte counter-prefix parsing, two size checks, two
// decrypt calls, an unterminated old LOGI/diag_write pair, two return
// statements). It does not compile as shown — confirm every comment below
// against the real file.
Result Decrypt(cricket::MediaType /*media_type*/,
const std::vector<uint32_t>& /*csrcs*/,
rtc::ArrayView<const uint8_t> additional_data,
rtc::ArrayView<const uint8_t> encrypted_frame,
rtc::ArrayView<uint8_t> frame) override {
const size_t HEADER = 4;
uint8_t nonce[24] = {0};
const uint8_t *payload;
size_t payload_sz;
if (encrypted_frame.size() > HEADER) {
// Android format: [4-byte counter] + [encrypted data]
nonce[4] = encrypted_frame.data()[0];
nonce[5] = encrypted_frame.data()[1];
nonce[6] = encrypted_frame.data()[2];
nonce[7] = encrypted_frame.data()[3];
payload = encrypted_frame.data() + HEADER;
payload_sz = encrypted_frame.size() - HEADER;
} else {
// Fallback: no counter prefix
payload = encrypted_frame.data();
payload_sz = encrypted_frame.size();
// header_size stays 0 unless the in-frame RTP fallback below succeeds.
size_t header_size = 0;
// Flags recording which source supplied the nonce; used for the diag "mode".
bool nonce_from_rtp_header = false;
bool nonce_from_generated_ts = false;
bool nonce_from_additional_data = false;
bool nonce_from_additional_normalized = false;
bool additional_was_rtp_header = false;
bool used_absolute_additional_fallback = false;
uint32_t generated_ts_used = 0;
// Source 1: additional_data, with timestamps rebased against additional_ts_
// (normalize_timestamps is passed as true).
nonce_from_additional_data = fill_nonce_from_additional_data(
additional_data.data(),
additional_data.size(),
nonce,
&additional_ts_,
true,
&nonce_from_additional_normalized,
&additional_was_rtp_header);
if (!nonce_from_additional_data) {
// Source 2: an RTP header inside encrypted_frame (needs rtp_probe_ to lock).
nonce_from_rtp_header =
fill_nonce_from_rtp_frame(encrypted_frame.data(), encrypted_frame.size(), &rtp_probe_, nonce, &header_size);
if (!nonce_from_rtp_header) {
// Source 3: locally generated timestamp, starting at 0.
if (!generated_ts_.initialized) {
generated_ts_.initialized = true;
generated_ts_.next_timestamp = 0;
generated_ts_.next_step = 960;
}
nonce_from_generated_ts = true;
generated_ts_used = generated_ts_.next_timestamp;
fill_nonce_from_ts32(generated_ts_used, nonce);
}
}
// NOTE(review): in the post-change flow the size check runs after the nonce
// derivation above, so additional_ts_ / rtp_probe_ / generated_ts_ may already
// have been updated for a frame that is rejected here.
if (payload_sz == 0 || frame.size() < payload_sz) {
if (encrypted_frame.size() == 0 || frame.size() < encrypted_frame.size()) {
return {Result::Status::kFailedToDecrypt, 0};
}
rosetta_xchacha20_xor(frame.data(), payload, payload_sz, nonce, key_);
bool used_generated_resync = false;
if (nonce_from_rtp_header && header_size <= encrypted_frame.size()) {
// RTP header bytes are copied through unchanged; only the payload is XORed.
if (header_size > 0) {
memcpy(frame.data(), encrypted_frame.data(), header_size);
}
const size_t payload_size = encrypted_frame.size() - header_size;
rosetta_xchacha20_xor(
frame.data() + header_size,
encrypted_frame.data() + header_size,
payload_size,
nonce,
key_);
} else {
rosetta_xchacha20_xor(frame.data(), encrypted_frame.data(), encrypted_frame.size(), nonce, key_);
}
// additional_data on Android can be absolute RTP-ish timestamp, while
// desktop nonce source is normalized stream timestamp. If normalized
// nonce gives implausible Opus, retry with absolute additional_data.
if (!nonce_from_generated_ts &&
nonce_from_additional_data &&
encrypted_frame.size() > 0 &&
additional_data.size() >= 8) {
const uint8_t* payload_ptr = frame.data() + header_size;
const size_t payload_size = encrypted_frame.size() - header_size;
if (!is_plausible_opus_packet(payload_ptr, payload_size)) {
uint8_t nonce_abs[24] = {0};
bool abs_norm = false;
bool abs_rtp = false;
// Recompute the nonce without rebasing (no ts_state, normalize = false) and
// retry only if it differs from the nonce already tried.
if (fill_nonce_from_additional_data(
additional_data.data(),
additional_data.size(),
nonce_abs,
nullptr,
false,
&abs_norm,
&abs_rtp) &&
memcmp(nonce_abs, nonce, 24) != 0) {
// NOTE(review): nonce_from_rtp_header is only ever set when
// nonce_from_additional_data is false, and this block requires it to be true,
// so the first branch below looks unreachable — confirm.
if (nonce_from_rtp_header && header_size <= encrypted_frame.size()) {
if (header_size > 0) {
memcpy(frame.data(), encrypted_frame.data(), header_size);
}
rosetta_xchacha20_xor(
frame.data() + header_size,
encrypted_frame.data() + header_size,
payload_size,
nonce_abs,
key_);
} else {
rosetta_xchacha20_xor(
frame.data(),
encrypted_frame.data(),
encrypted_frame.size(),
nonce_abs,
key_);
}
payload_ptr = frame.data() + header_size;
// NOTE(review): if this second attempt is implausible too, `frame` keeps the
// nonce_abs output while `nonce` (and the logged mode) still describe the
// first attempt; the first result is not restored.
if (is_plausible_opus_packet(payload_ptr, payload_size)) {
memcpy(nonce, nonce_abs, 24);
used_absolute_additional_fallback = true;
}
}
}
}
if (nonce_from_generated_ts) {
bool plausible = is_plausible_opus_packet(frame.data(), encrypted_frame.size());
// Recover after lost packets by probing a few forward timestamp steps.
if (!plausible) {
// Scratch buffer so failed attempts do not overwrite `frame`.
std::vector<uint8_t> candidate(encrypted_frame.size());
for (uint32_t i = 1; i <= 8; ++i) {
const uint32_t ts_try = generated_ts_used + generated_ts_.next_step * i;
uint8_t nonce_try[24] = {0};
fill_nonce_from_ts32(ts_try, nonce_try);
rosetta_xchacha20_xor(
candidate.data(),
encrypted_frame.data(),
encrypted_frame.size(),
nonce_try,
key_);
if (is_plausible_opus_packet(candidate.data(), candidate.size())) {
memcpy(frame.data(), candidate.data(), candidate.size());
generated_ts_used = ts_try;
used_generated_resync = true;
plausible = true;
break;
}
}
}
// Advance the generated timestamp by the duration inferred from whatever
// ended up in `frame`, whether or not it was judged plausible.
const uint32_t step = infer_opus_packet_duration_samples(frame.data(), encrypted_frame.size());
generated_ts_.next_step = step;
generated_ts_.next_timestamp = generated_ts_used + step;
}
// Diag: log first 3 frames
int n = diag_count_.fetch_add(1, std::memory_order_relaxed);
if (n < 3) {
LOGI("DEC frame#%d: enc_sz=%zu payload=%zu nonce=%02x%02x%02x%02x",
n, encrypted_frame.size(), payload_sz,
const char* mode = nullptr;
if (nonce_from_rtp_header) {
mode = "rtp";
} else if (nonce_from_generated_ts) {
mode = used_generated_resync ? "gen-resync" : "gen";
} else if (used_absolute_additional_fallback) {
mode = additional_was_rtp_header ? "ad-rtp-abs-fb" : "raw-abs-fb";
} else if (nonce_from_additional_data) {
mode =
additional_was_rtp_header
? (nonce_from_additional_normalized ? "ad-rtp-norm" : "ad-rtp")
: (nonce_from_additional_normalized ? "raw-norm" : "raw-abs");
} else {
mode = "raw-abs";
}
LOGI("DEC frame#%d: enc_sz=%zu ad=%zu hdr=%zu mode=%s nonce=%02x%02x%02x%02x",
n, encrypted_frame.size(), additional_data.size(), header_size, mode,
nonce[4], nonce[5], nonce[6], nonce[7]);
diag_write("DEC frame#%d: enc_sz=%zu payload=%zu nonce[4..7]=%02x%02x%02x%02x\n",
n, encrypted_frame.size(), payload_sz,
diag_write("DEC frame#%d: enc_sz=%zu ad=%zu hdr=%zu mode=%s nonce[4..7]=%02x%02x%02x%02x\n",
n, encrypted_frame.size(), additional_data.size(), header_size, mode,
nonce[4], nonce[5], nonce[6], nonce[7]);
}
return {Result::Status::kOk, payload_sz};
return {Result::Status::kOk, encrypted_frame.size()};
}
// Upper bound on the plaintext size for a ciphertext of encrypted_frame_size
// bytes: the input size itself.
// NOTE(review): the pre-change and post-change return lines both appear here;
// they return the same value, so the duplicate is harmless but only the first
// would execute as written — confirm against the real file.
size_t GetMaxPlaintextByteSize(cricket::MediaType, size_t encrypted_frame_size) override {
return encrypted_frame_size; // >= actual (payload = enc - 4)
return encrypted_frame_size;
}
protected:
@@ -244,6 +700,9 @@ protected:
private:
mutable std::atomic<int> ref_{0};
mutable std::atomic<int> diag_count_{0};
mutable RtpProbeState rtp_probe_;
mutable GeneratedTsState generated_ts_;
mutable AdditionalTsState additional_ts_;
uint8_t key_[32];
};