Files
mobile-android/app/src/main/cpp/rosetta_e2ee.cpp
k1ngsterr1 480fc9a1d0
All checks were successful
Android Kernel Build / build (push) Successful in 19m17s
Add E2EE diagnostic logging for debugging call encryption
- Enable diag file for all builds (was DEBUG-only)
- Add native frame count + bad streak query methods (JNI)
- Add periodic E2EE-HEALTH log with enc/dec frame counts
- Reduce scan receivers spam (only log on state change)
- Log E2EE state on call connected
- Log when attachSender/attachReceiver skips due to missing key
2026-04-01 16:28:23 +05:00

1157 lines
48 KiB
C++

/**
* JNI bridge for Rosetta E2EE.
*
* Provides:
* 1. HSalsa20 — for nacl.box.before() compatible key derivation
* 2. XChaCha20 FrameEncryptor / FrameDecryptor — inherits DIRECTLY from
* webrtc::FrameEncryptorInterface / FrameDecryptorInterface so the
* vtable is generated by the compiler, not guessed by us.
*/
#include <jni.h>
#include <cstdint>
#include <cstring>
#include <atomic>
#include <vector>
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <stdarg.h>
#include <stdio.h>
#include <android/log.h>
/* WebRTC M125 interface stubs (exact copies of the real headers) */
#include "webrtc/api/crypto/frame_encryptor_interface.h"
#include "webrtc/api/crypto/frame_decryptor_interface.h"
#include "crypto.h"
#define TAG "RosettaE2EE"
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
/* ── Diagnostics file — written from native, visible in crash reports ── */
// Path buffer for the diagnostics file; zero-initialised here.
static char g_diag_path[512] = {0};
// Descriptor diag_write()/diag_event() write to; both return immediately while it is negative.
static int g_diag_fd = -1;
// Running count of diag_event() calls made while the descriptor was open.
static std::atomic<int> g_diag_event_count{0};
// diag_event() drops every record once the running count reaches this value.
static constexpr int kDiagEventLimit = 4000;
// Cap compared against each object's per-frame counter before a verbose frame line is logged.
static constexpr int kDiagFrameLimit = 400;
// Passed to fnv1a32() as sample_limit when hashing frames for log output.
static constexpr size_t kFrameHashSampleBytes = 320;
/**
 * Formats one printf-style record and appends it to the diagnostics file.
 *
 * Does nothing while g_diag_fd is negative. Records longer than the local
 * buffer are truncated; a truncated record is terminated with '\n' so the
 * next record still starts on its own line.
 *
 * @param fmt printf-style format string, followed by its arguments.
 */
static void diag_write(const char *fmt, ...) {
    if (g_diag_fd < 0) return;
    char buf[512];
    va_list ap;
    va_start(ap, fmt);
    const int needed = vsnprintf(buf, sizeof(buf), fmt, ap);
    va_end(ap);
    if (needed <= 0) return;
    // vsnprintf() returns the length the full output would have had, which can
    // exceed what was stored. Writing that many bytes would read past buf.
    size_t out_len = (size_t)needed;
    if (out_len >= sizeof(buf)) {
        out_len = sizeof(buf) - 1;
        buf[out_len - 1] = '\n';
    }
    write(g_diag_fd, buf, out_len);
}
/**
 * Rate-limited variant of the diagnostics writer for notable events.
 *
 * Every call made while the file is open bumps g_diag_event_count; once that
 * counter has reached kDiagEventLimit the record is dropped. Records longer
 * than the local buffer are truncated and terminated with '\n'.
 *
 * @param fmt printf-style format string, followed by its arguments.
 */
static void diag_event(const char *fmt, ...) {
    if (g_diag_fd < 0) return;
    const int idx = g_diag_event_count.fetch_add(1, std::memory_order_relaxed);
    if (idx >= kDiagEventLimit) return;
    char buf[640];
    va_list ap;
    va_start(ap, fmt);
    const int needed = vsnprintf(buf, sizeof(buf), fmt, ap);
    va_end(ap);
    if (needed <= 0) return;
    // vsnprintf() returns the untruncated length; clamp it to what is actually
    // stored so write() never reads beyond buf.
    size_t out_len = (size_t)needed;
    if (out_len >= sizeof(buf)) {
        out_len = sizeof(buf) - 1;
        buf[out_len - 1] = '\n';
    }
    write(g_diag_fd, buf, out_len);
}
/**
 * 32-bit FNV-1a hash over the leading bytes of a buffer.
 *
 * @param data         bytes to hash; a null pointer yields 0.
 * @param len          number of bytes available; 0 yields 0.
 * @param sample_limit when non-zero and smaller than len, only this many
 *                     leading bytes are hashed.
 * @return the hash, or 0 for null/empty input.
 */
static inline uint32_t fnv1a32(const uint8_t* data, size_t len, size_t sample_limit = 0) {
    constexpr uint32_t kOffsetBasis = 2166136261u;
    constexpr uint32_t kPrime = 16777619u;

    if (data == nullptr || len == 0) return 0;

    size_t count = len;
    if (sample_limit != 0 && sample_limit < len) count = sample_limit;

    uint32_t hash = kOffsetBasis;
    for (const uint8_t* p = data; p != data + count; ++p) {
        hash = (hash ^ *p) * kPrime;
    }
    return hash;
}
/** Short log-friendly fingerprint of a 32-byte key: FNV-1a over all 32 bytes. */
static inline uint32_t key_fingerprint32(const uint8_t key[32]) {
    constexpr size_t kKeyBytes = 32;
    return fnv1a32(key, kKeyBytes, kKeyBytes);
}
/** Reads nonce bytes 4..7 as one big-endian 32-bit value (used for log output). */
static inline uint32_t nonce_ts32(const uint8_t nonce[24]) {
    uint32_t ts = 0;
    for (int i = 4; i < 8; ++i) {
        ts = (ts << 8) | (uint32_t)nonce[i];
    }
    return ts;
}
/** Maps a cricket::MediaType to the short label used in log lines. */
static const char* media_type_name(cricket::MediaType media_type) {
    if (media_type == cricket::MEDIA_TYPE_AUDIO) return "audio";
    if (media_type == cricket::MEDIA_TYPE_VIDEO) return "video";
    if (media_type == cricket::MEDIA_TYPE_DATA) return "data";
    if (media_type == cricket::MEDIA_TYPE_UNSUPPORTED) return "unsupported";
    // Any value outside the four known enumerators.
    return "unknown";
}
/* ── RTP helpers (for cases when additional_data is empty) ───── */
/** Fields extracted from an RTP header by parse_rtp_packet(). */
struct ParsedRtpPacket {
    size_t header_size{0};  // fixed header + CSRC list + optional extension, in bytes
    uint16_t sequence{0};   // big-endian value at bytes 2..3
    uint32_t timestamp{0};  // big-endian value at bytes 4..7
    uint32_t ssrc{0};       // big-endian value at bytes 8..11
};
/**
 * Tracking state used by fill_nonce_from_rtp_frame() to decide whether the
 * leading bytes of successive frames really form one RTP stream.
 */
struct RtpProbeState {
    bool locked{false};          // a stream has been confirmed; ssrc/last_* are valid
    bool has_probe{false};       // probe_* hold a candidate packet awaiting confirmation
    uint32_t probe_ssrc{0};      // candidate packet: SSRC
    uint16_t probe_sequence{0};  // candidate packet: sequence number
    uint32_t probe_timestamp{0}; // candidate packet: timestamp
    uint32_t ssrc{0};            // SSRC of the confirmed stream
    uint16_t last_sequence{0};   // most recent accepted sequence number
    uint32_t last_timestamp{0};  // timestamp of that packet
};
/** State of the locally generated timestamp counter (last-resort nonce source). */
struct GeneratedTsState {
    bool initialized{false};
    uint32_t next_timestamp{0};  // value the next generated-timestamp frame will use
    uint32_t next_step{960};     // 20 ms @ 48 kHz (default Opus packetization)
};
/** Baseline for turning 8-byte additional_data timestamps into stream-relative ones. */
struct AdditionalTsState {
    bool initialized{false};
    uint64_t base_timestamp{0};  // subtracted from each incoming timestamp
};
/** Optional constant offset the sender adds to 8-byte additional_data timestamps. */
struct SenderTsOffsetState {
    bool initialized{false};
    bool enabled{false};   // when false the offset is not applied
    uint64_t offset{0};
};
/** Reads two bytes at p as a big-endian 16-bit value. */
static inline uint16_t load16_be(const uint8_t* p) {
    const unsigned hi = p[0];
    const unsigned lo = p[1];
    return static_cast<uint16_t>((hi << 8) | lo);
}
/** Reads four bytes at p as a big-endian 32-bit value. */
static inline uint32_t load32_be(const uint8_t* p) {
    uint32_t value = 0;
    for (int i = 0; i < 4; ++i) {
        value = (value << 8) | (uint32_t)p[i];
    }
    return value;
}
/** Reads eight bytes at p as a big-endian 64-bit value. */
static inline uint64_t load64_be(const uint8_t* p) {
    uint64_t value = 0;
    for (int i = 0; i < 8; ++i) {
        value = (value << 8) | (uint64_t)p[i];
    }
    return value;
}
/**
 * Tries to interpret a buffer as an RTP packet (version 2).
 *
 * @param data buffer start; null is rejected.
 * @param len  buffer length in bytes.
 * @param out  receives header size, sequence, timestamp and SSRC; written
 *             only when the function returns true.
 * @return true when the header (including CSRC list and optional extension)
 *         fits in the buffer and is followed by 1..1200 payload bytes.
 */
static bool parse_rtp_packet(const uint8_t* data, size_t len, ParsedRtpPacket* out) {
    constexpr size_t kFixedHeaderBytes = 12;
    constexpr size_t kMaxPayloadBytes = 1200;

    if (data == nullptr || out == nullptr) return false;
    if (len < kFixedHeaderBytes) return false;

    const uint8_t first = data[0];
    // RTP version must be 2.
    if (((first >> 6) & 0x03) != 2) return false;

    // Low nibble of the first byte is the CSRC count, 4 bytes per entry.
    size_t header_bytes = kFixedHeaderBytes + (size_t)(first & 0x0F) * 4;
    if (header_bytes > len) return false;

    if ((first & 0x10) != 0) {
        // Extension header: 16-bit profile + 16-bit length (in 32-bit words).
        if (len < header_bytes + 4) return false;
        const size_t ext_words = load16_be(data + header_bytes + 2);
        header_bytes += 4 + ext_words * 4;
        if (header_bytes > len) return false;
    }

    const size_t payload_bytes = len - header_bytes;
    if (payload_bytes == 0 || payload_bytes > kMaxPayloadBytes) return false;

    out->header_size = header_bytes;
    out->sequence = load16_be(data + 2);
    out->timestamp = load32_be(data + 4);
    out->ssrc = load32_be(data + 8);
    return true;
}
static bool fill_nonce_from_rtp_frame(const uint8_t* data,
size_t len,
RtpProbeState* state,
uint8_t nonce[24],
size_t* header_size) {
if (!state) return false;
ParsedRtpPacket packet;
if (!parse_rtp_packet(data, len, &packet)) return false;
if (!state->locked) {
if (!state->has_probe) {
state->has_probe = true;
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
diag_event("RTP probe-start ssrc=%u seq=%u ts=%u hdr=%zu\n",
packet.ssrc, packet.sequence, packet.timestamp, packet.header_size);
return false;
}
const bool same_ssrc = packet.ssrc == state->probe_ssrc;
const uint16_t seq_delta = (uint16_t)(packet.sequence - state->probe_sequence);
const bool sequence_progressed = seq_delta > 0 && seq_delta <= 10;
if (!same_ssrc || !sequence_progressed) {
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
return false;
}
state->locked = true;
state->has_probe = false;
state->ssrc = packet.ssrc;
state->last_sequence = packet.sequence;
state->last_timestamp = packet.timestamp;
diag_event("RTP probe-lock ssrc=%u seq=%u ts=%u hdr=%zu\n",
packet.ssrc, packet.sequence, packet.timestamp, packet.header_size);
} else {
if (packet.ssrc != state->ssrc) {
diag_event("RTP probe-unlock reason=ssrc-change old=%u new=%u seq=%u ts=%u\n",
state->ssrc, packet.ssrc, packet.sequence, packet.timestamp);
state->locked = false;
state->has_probe = true;
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
return false;
}
const uint16_t seq_delta = (uint16_t)(packet.sequence - state->last_sequence);
// Accept in-order packets and small jumps (packet loss).
if (seq_delta != 0 && seq_delta <= 200) {
state->last_sequence = packet.sequence;
state->last_timestamp = packet.timestamp;
} else if (seq_delta != 0) {
// Not plausible for a continuous stream: re-probe.
diag_event("RTP probe-unlock reason=seq-jump delta=%u ssrc=%u last_seq=%u seq=%u ts=%u\n",
seq_delta, packet.ssrc, state->last_sequence, packet.sequence, packet.timestamp);
state->locked = false;
state->has_probe = true;
state->probe_ssrc = packet.ssrc;
state->probe_sequence = packet.sequence;
state->probe_timestamp = packet.timestamp;
return false;
}
}
nonce[4] = (uint8_t)(packet.timestamp >> 24);
nonce[5] = (uint8_t)(packet.timestamp >> 16);
nonce[6] = (uint8_t)(packet.timestamp >> 8);
nonce[7] = (uint8_t)(packet.timestamp);
if (header_size) *header_size = packet.header_size;
return true;
}
/**
 * Derives the leading nonce bytes from WebRTC's additional_data.
 *
 * Layouts, tried in this order:
 *   - exactly 8 bytes: big-endian 64-bit timestamp -> nonce[0..7]. With a
 *     non-null ts_state the value is first made relative to the state's base
 *     timestamp (the base is reset whenever a smaller timestamp arrives).
 *   - 12 bytes or more starting with RTP version 2: header timestamp
 *     (bytes 4..7) -> nonce[4..7].
 *   - anything else of 8+ bytes: first 8 bytes copied to nonce[0..7].
 *
 * @param data             additional_data bytes; null is rejected.
 * @param len              additional_data length; fewer than 8 bytes is rejected.
 * @param nonce            24-byte nonce, partially written on success.
 * @param used_rtp_header  optional; set true only for the RTP-header layout.
 * @param ts_state         optional; enables relative-timestamp mode for 8-byte input.
 * @param used_relative_ts optional; set true only when ts_state was applied.
 * @return true when a nonce was produced.
 */
static bool fill_nonce_from_additional_data(const uint8_t* data,
                                            size_t len,
                                            uint8_t nonce[24],
                                            bool* used_rtp_header,
                                            AdditionalTsState* ts_state,
                                            bool* used_relative_ts) {
    if (used_rtp_header != nullptr) *used_rtp_header = false;
    if (used_relative_ts != nullptr) *used_relative_ts = false;
    if (data == nullptr || len < 8) return false;

    // Desktop-compatible path: additional_data contains encoded frame timestamp
    // as 8-byte BE value. Absolute mode is ts_state == nullptr.
    if (len == 8) {
        const uint64_t absolute_ts = load64_be(data);
        uint64_t nonce_ts = absolute_ts;
        if (ts_state != nullptr) {
            // First sample, or a new stream/reset that went backwards: rebase
            // so the relative value cannot wrap to a huge number.
            if (!ts_state->initialized || absolute_ts < ts_state->base_timestamp) {
                ts_state->initialized = true;
                ts_state->base_timestamp = absolute_ts;
            }
            nonce_ts = absolute_ts - ts_state->base_timestamp;
            if (used_relative_ts != nullptr) *used_relative_ts = true;
        }
        for (int i = 0; i < 8; ++i) {
            nonce[i] = (uint8_t)(nonce_ts >> (56 - 8 * i));
        }
        return true;
    }

    // Legacy native WebRTC layout: additional_data can be RTP header bytes.
    const bool looks_like_rtp = len >= 12 && ((data[0] >> 6) & 0x03) == 2;
    if (looks_like_rtp) {
        // The header timestamp is already big-endian, so its four bytes map
        // straight onto nonce[4..7].
        memcpy(nonce + 4, data + 4, 4);
        if (used_rtp_header != nullptr) *used_rtp_header = true;
        return true;
    }

    // Generic fallback: first 8 bytes as BE timestamp-like payload.
    memcpy(nonce, data, 8);
    return true;
}
static bool is_plausible_opus_packet(const uint8_t* packet, size_t len);
static bool is_plausible_decrypted_audio_frame(const uint8_t* data, size_t len) {
if (!data || len == 0) return false;
ParsedRtpPacket packet;
if (parse_rtp_packet(data, len, &packet) && packet.header_size < len) {
return is_plausible_opus_packet(data + packet.header_size, len - packet.header_size);
}
return is_plausible_opus_packet(data, len);
}
/** Writes ts big-endian into nonce[4..7]; all other nonce bytes are left untouched. */
static inline void fill_nonce_from_ts32(uint32_t ts, uint8_t nonce[24]) {
    for (int i = 0; i < 4; ++i) {
        nonce[4 + i] = (uint8_t)(ts >> (24 - 8 * i));
    }
}
/** Writes ts big-endian into nonce[0..7]; the remaining nonce bytes are left untouched. */
static inline void fill_nonce_from_ts64(uint64_t ts, uint8_t nonce[24]) {
    for (int i = 0; i < 8; ++i) {
        nonce[i] = (uint8_t)(ts >> (56 - 8 * i));
    }
}
/**
 * Current CLOCK_MONOTONIC time expressed in 48 kHz ticks.
 * @return tick count, or 0 when clock_gettime() fails.
 */
static inline uint64_t monotonic_48k_ticks() {
    constexpr uint64_t kTicksPerSecond = 48000ULL;
    constexpr uint64_t kNanosPerSecond = 1000000000ULL;

    struct timespec now {};
    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) return 0;

    const uint64_t whole_seconds = (uint64_t)now.tv_sec * kTicksPerSecond;
    const uint64_t fraction = ((uint64_t)now.tv_nsec * kTicksPerSecond) / kNanosPerSecond;
    return whole_seconds + fraction;
}
/**
 * Samples per Opus frame at 48 kHz for a TOC "config" value
 * (RFC 6716 TOC config mapping).
 *
 *   0..11  SILK      10/20/40/60 ms
 *   12..15 Hybrid    10/20 ms
 *   16+    CELT-only 2.5/5/10/20 ms
 */
static inline uint32_t opus_base_frame_samples(uint8_t config) {
    static const uint32_t kSilkSamples[4] = {480, 960, 1920, 2880};
    static const uint32_t kCeltSamples[4] = {120, 240, 480, 960};

    const unsigned slot = config & 0x03u;
    if (config >= 16) return kCeltSamples[slot];
    if (config >= 12) return (config & 0x01u) != 0 ? 960u : 480u;
    return kSilkSamples[slot];
}
/**
 * Estimates how many 48 kHz samples an Opus packet covers, from its TOC byte
 * (and the frame-count byte for code-3 packets).
 *
 * @return the estimated duration, or 960 whenever the packet is missing, too
 *         short, or yields a total outside 120..5760 samples.
 */
static uint32_t infer_opus_packet_duration_samples(const uint8_t* packet, size_t len) {
    constexpr uint32_t kDefaultSamples = 960;
    if (packet == nullptr || len == 0) return kDefaultSamples;

    const uint8_t toc = packet[0];
    uint32_t frames = 0;
    switch (toc & 0x03) {
        case 0:
            frames = 1;
            break;
        case 1:
        case 2:
            frames = 2;
            break;
        default:
            // Code 3: the frame count lives in the low 6 bits of the second byte.
            if (len < 2) return kDefaultSamples;
            frames = (uint32_t)(packet[1] & 0x3F);
            if (frames == 0 || frames > 48) return kDefaultSamples;
            break;
    }

    const uint32_t samples = opus_base_frame_samples((uint8_t)(toc >> 3)) * frames;
    if (samples < 120 || samples > 5760) return kDefaultSamples;
    return samples;
}
/**
 * Decodes one Opus frame-length field (1 or 2 bytes) at *offset.
 *
 * A first byte below 252 is the length itself; otherwise a second byte
 * follows and the length is second * 4 + first.
 *
 * @param packet    packet bytes.
 * @param len       packet length.
 * @param offset    in/out read position; advanced past every byte consumed,
 *                  including the first byte when the second one is missing.
 * @param frame_len receives the decoded length on success.
 * @return false on null arguments or when the field runs past the packet.
 */
static bool decode_opus_frame_len(const uint8_t* packet,
                                  size_t len,
                                  size_t* offset,
                                  size_t* frame_len) {
    if (packet == nullptr || offset == nullptr || frame_len == nullptr) return false;

    size_t pos = *offset;
    if (pos >= len) return false;

    const size_t first = packet[pos++];
    *offset = pos;
    if (first < 252) {
        *frame_len = first;
        return true;
    }

    if (pos >= len) return false;
    const size_t second = packet[pos++];
    *offset = pos;
    *frame_len = second * 4u + first;
    return true;
}
/**
 * Structural sanity check used to tell a real Opus packet from random bytes.
 *
 * The TOC frame code (low two bits of the first byte) selects the layout:
 *   0 - one frame:        at least one byte after the TOC.
 *   1 - two equal frames: an even, non-zero number of bytes after the TOC.
 *   2 - two VBR frames:   non-zero first length, non-empty second frame.
 *   3 - N frames:         1..48 frames, at most 5760 samples in total, no
 *                         padding flag, and frame sizes that fit the packet.
 *
 * @return true when the packet (1..2000 bytes) passes the checks for its code.
 */
static bool is_plausible_opus_packet(const uint8_t* packet, size_t len) {
    if (packet == nullptr || len == 0 || len > 2000) return false;

    const uint8_t toc = packet[0];
    const uint8_t config = (uint8_t)(toc >> 3);
    // A 5-bit field cannot exceed 31; kept as an explicit guard.
    if (config > 31) return false;

    switch (toc & 0x03) {
        case 0: {
            // 1 frame, full payload
            return (len - 1) >= 1;
        }
        case 1: {
            // 2 CBR frames, equal sizes.
            const size_t body = len - 1;
            return body >= 2 && (body % 2) == 0;
        }
        case 2: {
            // 2 VBR frames
            size_t cursor = 1;
            size_t first_len = 0;
            if (!decode_opus_frame_len(packet, len, &cursor, &first_len)) return false;
            if (first_len == 0) return false;
            if (cursor + first_len >= len) return false;  // need non-empty second frame
            return (len - cursor - first_len) != 0;
        }
        default:
            break;
    }

    // frame code 3: arbitrary number of frames
    if (len < 2) return false;
    const uint8_t count_byte = packet[1];
    const uint8_t frames = (uint8_t)(count_byte & 0x3F);
    if (frames == 0 || frames > 48) return false;
    if (opus_base_frame_samples(config) * (uint32_t)frames > 5760) return false;

    // Padding bit is rarely used in live voice. Rejecting it improves
    // discrimination between valid Opus and random decrypted noise.
    if ((count_byte & 0x40) != 0) return false;

    const bool is_vbr = (count_byte & 0x80) != 0;
    if (!is_vbr) {
        const size_t body = len - 2;
        return body >= (size_t)frames && (body % frames) == 0;
    }

    // VBR: parse lengths for first N-1 frames; last frame consumes rest.
    size_t cursor = 2;
    size_t declared = 0;
    for (size_t i = 1; i < frames; ++i) {
        size_t one_len = 0;
        if (!decode_opus_frame_len(packet, len, &cursor, &one_len)) return false;
        if (one_len == 0) return false;
        declared += one_len;
        if (cursor + declared >= len) return false;
    }
    // Whatever is left after the declared frames belongs to the last frame,
    // which must not be empty.
    return (len - cursor) > declared;
}
/* ── Native crash handler — writes to file before dying ──────── */
// Path of the crash marker file; while its first byte is 0 the handler writes no file.
static char g_crash_path[512] = {0};
// Signal dispositions that native_crash_handler() restores before re-raising the signal.
static struct sigaction g_old_sigsegv = {};
static struct sigaction g_old_sigabrt = {};
/**
 * Signal handler: records a short crash marker, then hands the signal back to
 * the previously saved disposition.
 *
 * Steps, in order: write a one-line description to g_crash_path (if set),
 * log the signal and fault address, restore the saved sigaction for this
 * signal, and re-raise it.
 *
 * @param sig  signal number being handled.
 * @param info signal details; only si_addr is read, may be null.
 * @param ctx  unused.
 */
static void native_crash_handler(int sig, siginfo_t *info, void *ctx) {
    (void)ctx;

    const char *report = nullptr;
    switch (sig) {
        case SIGSEGV:
            report = "NATIVE CRASH: SIGSEGV (segmentation fault) in rosetta_e2ee\n"
                     "Fault address: see logcat for details.\n";
            break;
        case SIGABRT:
            report = "NATIVE CRASH: SIGABRT (abort) in rosetta_e2ee\n";
            break;
        default:
            report = "NATIVE CRASH: unknown signal in rosetta_e2ee\n";
            break;
    }

    if (g_crash_path[0] != 0) {
        const int fd = open(g_crash_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd >= 0) {
            write(fd, report, strlen(report));
            close(fd);
        }
    }

    LOGE("NATIVE CRASH sig=%d addr=%p", sig, info ? info->si_addr : nullptr);

    // Any signal other than SIGSEGV falls back to the saved SIGABRT disposition.
    struct sigaction *previous = &g_old_sigabrt;
    if (sig == SIGSEGV) previous = &g_old_sigsegv;
    sigaction(sig, previous, nullptr);
    raise(sig);
}
/* ════════════════════════════════════════════════════════════════
* XChaCha20 Encryptor — inherits from the REAL interface
* ════════════════════════════════════════════════════════════════ */
/**
 * WebRTC frame encryptor that XORs every outgoing encoded frame with an
 * XChaCha20 keystream (rosetta_xchacha20_xor) under a fixed 32-byte key.
 *
 * The object is intrusively reference counted through AddRef()/Release();
 * the counter starts at 0 and the object deletes itself when the last
 * reference is released. The key copy is wiped in the destructor.
 */
class XChaCha20Encryptor final : public webrtc::FrameEncryptorInterface {
public:
    /** Copies the 32-byte key and logs a fingerprint of it. */
    explicit XChaCha20Encryptor(const uint8_t key[32]) {
        memcpy(key_, key, 32);
        key_fingerprint_ = key_fingerprint32(key_);
        LOGI("ENC init ptr=%p key_fp=%08x", this, key_fingerprint_);
        diag_event("ENC init ptr=%p key_fp=%08x\n", this, key_fingerprint_);
    }

    /** FNV-1a fingerprint of the key, as printed in log lines. */
    uint32_t KeyFingerprint() const { return key_fingerprint_; }

    /** Number of frames that have passed through Encrypt() successfully. */
    int FrameCount() const { return diag_count_.load(std::memory_order_relaxed); }

    /* ── RefCountInterface ─────────────────────────────────────── */
    void AddRef() const override {
        ref_.fetch_add(1, std::memory_order_relaxed);
    }

    rtc::RefCountReleaseStatus Release() const override {
        // acq_rel: the thread that drops the last reference must observe all
        // earlier writes to the object before destroying it.
        if (ref_.fetch_sub(1, std::memory_order_acq_rel) == 1) {
            delete this;
            return rtc::RefCountReleaseStatus::kDroppedLastRef;
        }
        return rtc::RefCountReleaseStatus::kOtherRefsRemained;
    }

    /**
     * Desktop-compatible frame format: ciphertext only (no custom prefix),
     * same length as the plaintext. The WHOLE frame is encrypted; no leading
     * RTP-like bytes are preserved.
     *
     * The 24-byte nonce starts as all zeros and its leading bytes are filled
     * from the first source that succeeds:
     *   1. additional_data (needs at least 8 bytes):
     *        - exactly 8 bytes           -> nonce[0..7] = those bytes
     *        - >= 12 bytes, RTP version 2 -> nonce[4..7] = bytes 4..7
     *        - otherwise                  -> nonce[0..7] = first 8 bytes
     *   2. an RTP header found at the start of the frame, once the probe
     *      state has locked onto the stream -> nonce[4..7] = RTP timestamp.
     *      The parsed header size is only reported in the log line.
     *   3. a locally generated timestamp -> nonce[4..7]; it starts at 0 and
     *      advances by the Opus duration inferred from each frame.
     *
     * The sender timestamp offset is initialised as disabled, so 8-byte
     * additional_data timestamps are used unmodified.
     *
     * @param media_type      used for logging only.
     * @param ssrc            used for logging only.
     * @param additional_data preferred nonce source (see above).
     * @param frame           plaintext encoded frame.
     * @param encrypted_frame output buffer, at least frame.size() bytes.
     * @param bytes_written   receives frame.size() on success, 0 on failure.
     * @return 0 on success, -1 when frame is empty or the output is too small.
     */
    int Encrypt(cricket::MediaType media_type,
                uint32_t ssrc,
                rtc::ArrayView<const uint8_t> additional_data,
                rtc::ArrayView<const uint8_t> frame,
                rtc::ArrayView<uint8_t> encrypted_frame,
                size_t* bytes_written) override {
        if (frame.size() == 0 || encrypted_frame.size() < frame.size()) {
            *bytes_written = 0;
            return -1;
        }
        size_t header_size = 0;
        bool nonce_from_rtp_header = false;
        bool nonce_from_generated_ts = false;
        bool nonce_from_additional_data = false;
        bool additional_was_rtp_header = false;
        bool additional_used_mono_offset = false;
        uint32_t generated_ts_used = 0;

        // Build nonce from the timestamp in additional_data (preferred).
        uint8_t nonce[24] = {0};
        bool additional_used_relative_ts = false;
        nonce_from_additional_data = fill_nonce_from_additional_data(
            additional_data.data(),
            additional_data.size(),
            nonce,
            &additional_was_rtp_header,
            nullptr,
            &additional_used_relative_ts);
        if (!nonce_from_additional_data) {
            nonce_from_rtp_header =
                fill_nonce_from_rtp_frame(frame.data(), frame.size(), &rtp_probe_, nonce, &header_size);
            if (!nonce_from_rtp_header) {
                if (!generated_ts_.initialized) {
                    generated_ts_.initialized = true;
                    generated_ts_.next_timestamp = 0;
                    generated_ts_.next_step = 960;
                }
                nonce_from_generated_ts = true;
                generated_ts_used = generated_ts_.next_timestamp;
                fill_nonce_from_ts32(generated_ts_used, nonce);
                diag_event("ENC fallback=generated-ts mt=%s ssrc=%u frame_sz=%zu ad_sz=%zu gen_ts=%u\n",
                           media_type_name(media_type), ssrc, frame.size(), additional_data.size(), generated_ts_used);
            }
        }

        // Some Android sender pipelines expose stream-relative ad8 timestamps
        // (0, 960, 1920, ...), while desktop receiver expects an absolute base.
        // The offset mechanism below exists for that case but is initialised
        // as disabled.
        if (nonce_from_additional_data &&
            additional_data.size() == 8 &&
            !additional_was_rtp_header &&
            additional_data.data() != nullptr) {
            const uint64_t ad_ts64 = load64_be(additional_data.data());
            if (!sender_ts_offset_.initialized) {
                sender_ts_offset_.initialized = true;
                // Keep pure raw-abs mode by default; desktop is the source of truth.
                sender_ts_offset_.enabled = false;
                sender_ts_offset_.offset = 0ULL;
                diag_event("ENC ad8-base init ssrc=%u ad_ts=%llu use_mono=%d mono_off=%llu\n",
                           ssrc,
                           (unsigned long long)ad_ts64,
                           sender_ts_offset_.enabled ? 1 : 0,
                           (unsigned long long)sender_ts_offset_.offset);
            }
            if (sender_ts_offset_.enabled) {
                const uint64_t ts_adj = ad_ts64 + sender_ts_offset_.offset;
                fill_nonce_from_ts64(ts_adj, nonce);
                additional_used_mono_offset = true;
            }
        }

        // Desktop createEncodedStreams encrypts full encoded chunk.
        // To stay wire-compatible, do not preserve any leading RTP-like bytes.
        rosetta_xchacha20_xor(encrypted_frame.data(),
                              frame.data(), frame.size(), nonce, key_);
        *bytes_written = frame.size();

        if (nonce_from_generated_ts) {
            // Advance the generated clock by the duration of the frame just sent.
            const uint32_t step = infer_opus_packet_duration_samples(frame.data(), frame.size());
            generated_ts_.next_step = step;
            generated_ts_.next_timestamp = generated_ts_used + step;
        }

        // Diag: log first frames with enough context for crash analysis.
        int n = diag_count_.fetch_add(1, std::memory_order_relaxed);
        if (n < kDiagFrameLimit) {
            uint8_t ad_prefix[8] = {0};
            const size_t ad_copy = additional_data.size() < sizeof(ad_prefix)
                                       ? additional_data.size()
                                       : sizeof(ad_prefix);
            if (ad_copy > 0) memcpy(ad_prefix, additional_data.data(), ad_copy);
            ParsedRtpPacket rtp{};
            const bool has_rtp = parse_rtp_packet(frame.data(), frame.size(), &rtp);
            const bool opus_plausible =
                has_rtp && rtp.header_size < frame.size()
                    ? is_plausible_opus_packet(frame.data() + rtp.header_size, frame.size() - rtp.header_size)
                    : is_plausible_opus_packet(frame.data(), frame.size());
            const uint32_t in_hash = fnv1a32(frame.data(), frame.size(), kFrameHashSampleBytes);
            const uint32_t out_hash = fnv1a32(encrypted_frame.data(), frame.size(), kFrameHashSampleBytes);
            const char* mode =
                nonce_from_rtp_header
                    ? "rtp"
                    : (nonce_from_generated_ts
                           ? "gen"
                           : (nonce_from_additional_data
                                  ? (additional_was_rtp_header
                                         ? "ad-rtp"
                                         : (additional_used_mono_offset
                                                ? "raw-abs+mono"
                                                : (additional_used_relative_ts ? "raw-rel" : "raw-abs")))
                                  : "raw-abs"));
            LOGI("ENC frame#%d mt=%s ssrc=%u sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u rtp_ok=%d rtp_seq=%u rtp_ts=%u rtp_ssrc=%u opus_ok=%d key_fp=%08x in_h=%08x out_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x",
                 n, media_type_name(media_type), ssrc, frame.size(), additional_data.size(), header_size, mode,
                 nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step,
                 has_rtp ? 1 : 0, has_rtp ? rtp.sequence : 0, has_rtp ? rtp.timestamp : 0, has_rtp ? rtp.ssrc : 0,
                 opus_plausible ? 1 : 0, key_fingerprint_, in_hash, out_hash,
                 ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3],
                 ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]);
            diag_write("ENC frame#%d mt=%s ssrc=%u sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u rtp_ok=%d rtp_seq=%u rtp_ts=%u rtp_ssrc=%u opus_ok=%d key_fp=%08x in_h=%08x out_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x\n",
                       n, media_type_name(media_type), ssrc, frame.size(), additional_data.size(), header_size, mode,
                       nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step,
                       has_rtp ? 1 : 0, has_rtp ? rtp.sequence : 0, has_rtp ? rtp.timestamp : 0, has_rtp ? rtp.ssrc : 0,
                       opus_plausible ? 1 : 0, key_fingerprint_, in_hash, out_hash,
                       ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3],
                       ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]);
        }
        return 0;
    }

    /** Ciphertext is exactly as long as the plaintext (stream cipher, no tag or prefix). */
    size_t GetMaxCiphertextByteSize(cricket::MediaType, size_t frame_size) override {
        return frame_size;
    }

protected:
    ~XChaCha20Encryptor() override {
        diag_event("ENC destroy ptr=%p key_fp=%08x\n", this, key_fingerprint_);
        // Wipe the key with volatile stores. A plain memset() on a member that
        // is never read again is a dead store the optimiser is allowed to
        // remove, which would leave the key material in freed memory.
        volatile uint8_t* wipe = key_;
        for (size_t i = 0; i < sizeof(key_); ++i) {
            wipe[i] = 0;
        }
    }

private:
    mutable std::atomic<int> ref_{0};          // intrusive reference count
    mutable std::atomic<int> diag_count_{0};   // frames encrypted so far
    mutable RtpProbeState rtp_probe_;          // RTP-in-frame detection state
    mutable GeneratedTsState generated_ts_;    // generated-timestamp fallback state
    mutable SenderTsOffsetState sender_ts_offset_;
    uint32_t key_fingerprint_ = 0;
    uint8_t key_[32];
};
/* ════════════════════════════════════════════════════════════════
* XChaCha20 Decryptor — inherits from the REAL interface
* ════════════════════════════════════════════════════════════════ */
class XChaCha20Decryptor final : public webrtc::FrameDecryptorInterface {
public:
explicit XChaCha20Decryptor(const uint8_t key[32]) {
memcpy(key_, key, 32);
key_fingerprint_ = key_fingerprint32(key_);
LOGI("DEC init ptr=%p key_fp=%08x", this, key_fingerprint_);
diag_event("DEC init ptr=%p key_fp=%08x\n", this, key_fingerprint_);
}
/** Fingerprint of the key, computed once in the constructor. */
uint32_t KeyFingerprint() const {
    return key_fingerprint_;
}
/** Current value of the per-object frame counter (relaxed atomic read). */
int FrameCount() const {
    const int frames_seen = diag_count_.load(std::memory_order_relaxed);
    return frames_seen;
}
/** Current value of the bad-audio streak counter (relaxed atomic read). */
uint32_t BadStreak() const {
    const uint32_t streak = bad_audio_streak_.load(std::memory_order_relaxed);
    return streak;
}
/* ── RefCountInterface ─────────────────────────────────────── */
/** Takes one more reference on this object. */
void AddRef() const override {
    // Incrementing needs no ordering with other memory operations.
    ref_.fetch_add(1, std::memory_order_relaxed);
}
/**
 * Drops one reference; the object deletes itself when the last one goes away.
 * @return kDroppedLastRef if this call destroyed the object, otherwise
 *         kOtherRefsRemained.
 */
rtc::RefCountReleaseStatus Release() const override {
    const auto refs_before = ref_.fetch_sub(1, std::memory_order_acq_rel);
    if (refs_before != 1) {
        return rtc::RefCountReleaseStatus::kOtherRefsRemained;
    }
    delete this;
    return rtc::RefCountReleaseStatus::kDroppedLastRef;
}
/**
* Desktop-compatible decrypt:
* - nonce from RTP timestamp
* - if RTP header is present inside encrypted_frame (fallback path),
* keep header bytes untouched and decrypt payload only.
*/
Result Decrypt(cricket::MediaType media_type,
const std::vector<uint32_t>& csrcs,
rtc::ArrayView<const uint8_t> additional_data,
rtc::ArrayView<const uint8_t> encrypted_frame,
rtc::ArrayView<uint8_t> frame) override {
uint8_t nonce[24] = {0};
size_t header_size = 0;
bool nonce_from_rtp_header = false;
bool nonce_from_generated_ts = false;
bool nonce_from_additional_data = false;
bool additional_was_rtp_header = false;
bool additional_used_relative_ts = false;
bool used_additional_relative_fallback = false;
bool used_plain_passthrough = false;
uint32_t generated_ts_used = 0;
nonce_from_additional_data = fill_nonce_from_additional_data(
additional_data.data(),
additional_data.size(),
nonce,
&additional_was_rtp_header,
nullptr,
&additional_used_relative_ts);
if (!nonce_from_additional_data) {
nonce_from_rtp_header =
fill_nonce_from_rtp_frame(encrypted_frame.data(), encrypted_frame.size(), &rtp_probe_, nonce, &header_size);
if (!nonce_from_rtp_header) {
if (!generated_ts_.initialized) {
generated_ts_.initialized = true;
generated_ts_.next_timestamp = 0;
generated_ts_.next_step = 960;
}
nonce_from_generated_ts = true;
generated_ts_used = generated_ts_.next_timestamp;
fill_nonce_from_ts32(generated_ts_used, nonce);
diag_event("DEC fallback=generated-ts mt=%s csrcs=%zu enc_sz=%zu ad_sz=%zu gen_ts=%u\n",
media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size(), generated_ts_used);
}
}
if (encrypted_frame.size() == 0 || frame.size() < encrypted_frame.size()) {
return {Result::Status::kFailedToDecrypt, 0};
}
bool used_generated_resync = false;
// Desktop createEncodedStreams decrypts full encoded chunk.
rosetta_xchacha20_xor(frame.data(), encrypted_frame.data(), encrypted_frame.size(), nonce, key_);
if (nonce_from_additional_data) {
bool plausible = is_plausible_decrypted_audio_frame(frame.data(), encrypted_frame.size());
// Fallback for Android pipelines where additional_data timestamps are
// stream-relative while remote side uses a different absolute base.
if (!plausible && additional_data.size() == 8 && !additional_was_rtp_header) {
uint8_t nonce_rel[24] = {0};
bool rel_used = false;
if (fill_nonce_from_additional_data(
additional_data.data(),
additional_data.size(),
nonce_rel,
nullptr,
&additional_rel_ts_state_,
&rel_used) &&
rel_used) {
std::vector<uint8_t> candidate(encrypted_frame.size());
rosetta_xchacha20_xor(
candidate.data(),
encrypted_frame.data(),
encrypted_frame.size(),
nonce_rel,
key_);
if (is_plausible_decrypted_audio_frame(candidate.data(), candidate.size())) {
memcpy(frame.data(), candidate.data(), candidate.size());
memcpy(nonce, nonce_rel, sizeof(nonce));
plausible = true;
used_additional_relative_fallback = true;
additional_used_relative_ts = true;
diag_event("DEC fallback=relative-ad-ts mt=%s csrcs=%zu enc_sz=%zu ad_sz=%zu nonce_ts=%u\n",
media_type_name(media_type), csrcs.size(), encrypted_frame.size(),
additional_data.size(), nonce_ts32(nonce));
}
}
}
// If payload already looks like valid Opus, keep plaintext.
// This protects interop when peer stream is unexpectedly unencrypted.
if (!plausible &&
is_plausible_decrypted_audio_frame(encrypted_frame.data(), encrypted_frame.size())) {
memcpy(frame.data(), encrypted_frame.data(), encrypted_frame.size());
used_plain_passthrough = true;
diag_event("DEC fallback=plain-passthrough mt=%s csrcs=%zu enc_sz=%zu ad_sz=%zu\n",
media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size());
}
}
if (nonce_from_generated_ts) {
bool plausible = is_plausible_opus_packet(frame.data(), encrypted_frame.size());
// Recover after lost packets by probing a few forward timestamp steps.
if (!plausible) {
std::vector<uint8_t> candidate(encrypted_frame.size());
for (uint32_t i = 1; i <= 8; ++i) {
const uint32_t ts_try = generated_ts_used + generated_ts_.next_step * i;
uint8_t nonce_try[24] = {0};
fill_nonce_from_ts32(ts_try, nonce_try);
rosetta_xchacha20_xor(
candidate.data(),
encrypted_frame.data(),
encrypted_frame.size(),
nonce_try,
key_);
if (is_plausible_opus_packet(candidate.data(), candidate.size())) {
memcpy(frame.data(), candidate.data(), candidate.size());
generated_ts_used = ts_try;
used_generated_resync = true;
plausible = true;
diag_event("DEC fallback=generated-resync mt=%s csrcs=%zu enc_sz=%zu ts_try=%u step=%u probe=%u\n",
media_type_name(media_type), csrcs.size(), encrypted_frame.size(),
ts_try, generated_ts_.next_step, i);
break;
}
}
}
const uint32_t step = infer_opus_packet_duration_samples(frame.data(), encrypted_frame.size());
generated_ts_.next_step = step;
generated_ts_.next_timestamp = generated_ts_used + step;
}
// Diag: log first frames with enough context for crash analysis.
int n = diag_count_.fetch_add(1, std::memory_order_relaxed);
if (n < kDiagFrameLimit) {
uint8_t ad_prefix[8] = {0};
const size_t ad_copy = additional_data.size() < sizeof(ad_prefix)
? additional_data.size()
: sizeof(ad_prefix);
if (ad_copy > 0) memcpy(ad_prefix, additional_data.data(), ad_copy);
ParsedRtpPacket enc_rtp{};
ParsedRtpPacket dec_rtp{};
const bool has_enc_rtp = parse_rtp_packet(encrypted_frame.data(), encrypted_frame.size(), &enc_rtp);
const bool has_dec_rtp = parse_rtp_packet(frame.data(), encrypted_frame.size(), &dec_rtp);
const bool dec_plausible = is_plausible_decrypted_audio_frame(frame.data(), encrypted_frame.size());
const uint32_t enc_hash = fnv1a32(encrypted_frame.data(), encrypted_frame.size(), kFrameHashSampleBytes);
const uint32_t dec_hash = fnv1a32(frame.data(), encrypted_frame.size(), kFrameHashSampleBytes);
const char* mode = nullptr;
if (nonce_from_rtp_header) {
mode = "rtp";
} else if (nonce_from_generated_ts) {
mode = used_generated_resync ? "gen-resync" : "gen";
} else if (nonce_from_additional_data) {
if (used_plain_passthrough) mode = "raw-plain";
else if (additional_was_rtp_header) mode = "ad-rtp";
else if (used_additional_relative_fallback) mode = "raw-rel-fb";
else mode = additional_used_relative_ts ? "raw-rel" : "raw-abs";
} else {
mode = "raw-abs";
}
uint32_t bad_streak = 0;
if (!dec_plausible) {
bad_streak = bad_audio_streak_.fetch_add(1, std::memory_order_relaxed) + 1;
if (bad_streak == 1 || bad_streak == 3 || bad_streak == 10 || (bad_streak % 50) == 0) {
diag_event("DEC degraded mt=%s csrcs=%zu mode=%s bad_streak=%u nonce_ts=%u key_fp=%08x\n",
media_type_name(media_type), csrcs.size(), mode, bad_streak,
nonce_ts32(nonce), key_fingerprint_);
}
} else {
const uint32_t prev_bad = bad_audio_streak_.exchange(0, std::memory_order_relaxed);
if (prev_bad >= 3) {
diag_event("DEC recovered mt=%s csrcs=%zu mode=%s prev_bad_streak=%u nonce_ts=%u key_fp=%08x\n",
media_type_name(media_type), csrcs.size(), mode, prev_bad,
nonce_ts32(nonce), key_fingerprint_);
}
}
LOGI("DEC frame#%d mt=%s csrcs=%zu enc_sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u dec_ok=%d bad_streak=%u enc_rtp=%d enc_seq=%u enc_ts=%u enc_ssrc=%u dec_rtp=%d dec_seq=%u dec_ts=%u dec_ssrc=%u key_fp=%08x enc_h=%08x dec_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x",
n, media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size(), header_size, mode,
nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step,
dec_plausible ? 1 : 0, bad_streak,
has_enc_rtp ? 1 : 0, has_enc_rtp ? enc_rtp.sequence : 0, has_enc_rtp ? enc_rtp.timestamp : 0, has_enc_rtp ? enc_rtp.ssrc : 0,
has_dec_rtp ? 1 : 0, has_dec_rtp ? dec_rtp.sequence : 0, has_dec_rtp ? dec_rtp.timestamp : 0, has_dec_rtp ? dec_rtp.ssrc : 0,
key_fingerprint_, enc_hash, dec_hash,
ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3],
ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]);
diag_write("DEC frame#%d mt=%s csrcs=%zu enc_sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u dec_ok=%d bad_streak=%u enc_rtp=%d enc_seq=%u enc_ts=%u enc_ssrc=%u dec_rtp=%d dec_seq=%u dec_ts=%u dec_ssrc=%u key_fp=%08x enc_h=%08x dec_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x\n",
n, media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size(), header_size, mode,
nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step,
dec_plausible ? 1 : 0, bad_streak,
has_enc_rtp ? 1 : 0, has_enc_rtp ? enc_rtp.sequence : 0, has_enc_rtp ? enc_rtp.timestamp : 0, has_enc_rtp ? enc_rtp.ssrc : 0,
has_dec_rtp ? 1 : 0, has_dec_rtp ? dec_rtp.sequence : 0, has_dec_rtp ? dec_rtp.timestamp : 0, has_dec_rtp ? dec_rtp.ssrc : 0,
key_fingerprint_, enc_hash, dec_hash,
ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3],
ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]);
}
return {Result::Status::kOk, encrypted_frame.size()};
}
// Upper bound on the plaintext size for an encrypted frame of the given size.
// The bound is the encrypted size itself; the media type is not consulted.
size_t GetMaxPlaintextByteSize(cricket::MediaType /*media_type*/, size_t encrypted_frame_size) override {
    const size_t max_plaintext_bytes = encrypted_frame_size;
    return max_plaintext_bytes;
}
protected:
/**
 * Records the teardown in the diagnostics log (pointer, key fingerprint and
 * the current bad-audio streak) and then erases the key material.
 */
~XChaCha20Decryptor() override {
    diag_event("DEC destroy ptr=%p key_fp=%08x bad_streak=%u\n",
               this, key_fingerprint_, bad_audio_streak_.load(std::memory_order_relaxed));
    // A plain memset() here is a dead store (the object's lifetime ends right
    // after this body), so the optimiser is allowed to drop it and leave the
    // key in freed memory. Writing through a volatile pointer keeps the wipe.
    volatile uint8_t *wipe = key_;
    for (size_t i = 0; i < sizeof(key_); ++i) {
        wipe[i] = 0;
    }
}
private:
// Reference counter, starting at 0.
// NOTE(review): presumably driven by AddRef()/Release() defined earlier in the class — confirm.
mutable std::atomic<int> ref_{0};
// Bumped once per frame on the decrypt path; frames whose index is below
// kDiagFrameLimit get the detailed "DEC frame#" log line.
mutable std::atomic<int> diag_count_{0};
// Count of consecutive decrypted frames that failed the plausibility check;
// reset to 0 when a plausible frame is seen.
// NOTE(review): the update sits inside the `n < kDiagFrameLimit` branch, so it
// stops changing once the detailed logging window is over — confirm intended.
mutable std::atomic<uint32_t> bad_audio_streak_{0};
// NOTE(review): RTP probing state; its use is outside the visible part of the file.
mutable RtpProbeState rtp_probe_;
// State for the generated-timestamp nonce mode: the decrypt path stores the
// inferred packet duration in next_step and advances next_timestamp by it.
mutable GeneratedTsState generated_ts_;
// NOTE(review): presumably tracks relative timestamps derived from additional_data — confirm.
mutable AdditionalTsState additional_rel_ts_state_;
// Short fingerprint of the key, printed in log/diagnostic lines as key_fp.
uint32_t key_fingerprint_ = 0;
// 32-byte key; zeroed in the destructor.
uint8_t key_[32];
};
/* ════════════════════════════════════════════════════════════════
* JNI exports
* ════════════════════════════════════════════════════════════════ */
extern "C" {
/* ── HSalsa20 for nacl.box.before() ──────────────────────────── */
/**
 * Runs rosetta_hsalsa20() over the first 32 bytes of jRawDh with an all-zero
 * 16-byte input block and returns the 32-byte output as a new Java byte[].
 *
 * @param jRawDh  input bytes (presumably the raw DH shared secret — confirm
 *                against the Kotlin caller); must hold at least 32 bytes.
 * @return a new 32-byte array, or nullptr when the input is null, shorter
 *         than 32 bytes, or the result array cannot be allocated.
 */
JNIEXPORT jbyteArray JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeHSalsa20(
        JNIEnv *env, jclass, jbyteArray jRawDh)
{
    if (jRawDh == nullptr) return nullptr;
    if (env->GetArrayLength(jRawDh) < 32) return nullptr;

    // Copy into a stack buffer instead of pinning the Java array: there is
    // no nullable pointer to check and we control wiping of the copy.
    uint8_t raw[32];
    env->GetByteArrayRegion(jRawDh, 0, 32, reinterpret_cast<jbyte *>(raw));

    uint8_t out[32];
    uint8_t zeros[16] = {0};
    rosetta_hsalsa20(out, zeros, raw);

    // NewByteArray returns nullptr (with a pending OutOfMemoryError) on failure.
    jbyteArray result = env->NewByteArray(32);
    if (result != nullptr) {
        env->SetByteArrayRegion(result, 0, 32, reinterpret_cast<const jbyte *>(out));
    }

    // Volatile writes so the compiler cannot discard the wipe as a dead store.
    auto wipe = [](uint8_t *bytes, size_t count) {
        volatile uint8_t *p = bytes;
        for (size_t i = 0; i < count; ++i) p[i] = 0;
    };
    wipe(raw, sizeof(raw));
    wipe(out, sizeof(out));
    return result;
}
/* ── Create / destroy encryptor ──────────────────────────────── */
/**
 * Creates an XChaCha20Encryptor from the first 32 bytes of jKey.
 *
 * @param jKey  key bytes; must hold at least 32 bytes.
 * @return the encryptor pointer as a jlong handle holding one reference, or
 *         0 when the key is null or shorter than 32 bytes. The handle must
 *         be given back through nativeReleaseEncryptor.
 */
JNIEXPORT jlong JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeCreateEncryptor(
        JNIEnv *env, jclass, jbyteArray jKey)
{
    // A null array is reported through the same path as a too-short key.
    const jsize len = (jKey != nullptr) ? env->GetArrayLength(jKey) : 0;
    if (len < 32) {
        LOGE("Create encryptor failed: key length=%d (<32)", static_cast<int>(len));
        diag_event("ENC create-failed key_len=%d\n", static_cast<int>(len));
        return 0;
    }

    // Copy the key to the stack rather than pinning the Java array: no
    // nullable pointer to handle, and the temporary copy can be wiped.
    uint8_t key[32];
    env->GetByteArrayRegion(jKey, 0, 32, reinterpret_cast<jbyte *>(key));
    const uint32_t key_fp = key_fingerprint32(key);
    auto *enc = new XChaCha20Encryptor(key);

    // Volatile writes so the wipe is not removed as a dead store.
    volatile uint8_t *wipe = key;
    for (size_t i = 0; i < sizeof(key); ++i) wipe[i] = 0;

    // AddRef so the pointer we hand out has ref=1.
    // WebRTC's scoped_refptr will AddRef again when it takes ownership.
    enc->AddRef();
    LOGI("Created XChaCha20 encryptor %p key_fp=%08x", enc, key_fp);
    diag_event("ENC created ptr=%p key_fp=%08x key_len=%d\n", enc, key_fp, static_cast<int>(len));
    return reinterpret_cast<jlong>(enc);
}
/**
 * Logs the release and drops one reference on the encryptor behind `ptr`
 * (the counterpart of the AddRef done in nativeCreateEncryptor).
 * A zero handle is ignored.
 */
JNIEXPORT void JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeReleaseEncryptor(
        JNIEnv *, jclass, jlong ptr)
{
    if (ptr != 0) {
        auto *encryptor = reinterpret_cast<XChaCha20Encryptor *>(ptr);
        // Log before Release(): the object must not be touched afterwards.
        LOGI("Release XChaCha20 encryptor %p key_fp=%08x", encryptor, encryptor->KeyFingerprint());
        diag_event("ENC release ptr=%p key_fp=%08x\n", encryptor, encryptor->KeyFingerprint());
        encryptor->Release();
    }
}
/* ── Create / destroy decryptor ──────────────────────────────── */
/**
 * Creates an XChaCha20Decryptor from the first 32 bytes of jKey.
 *
 * @param jKey  key bytes; must hold at least 32 bytes.
 * @return the decryptor pointer as a jlong handle holding one reference, or
 *         0 when the key is null or shorter than 32 bytes. The handle must
 *         be given back through nativeReleaseDecryptor.
 */
JNIEXPORT jlong JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeCreateDecryptor(
        JNIEnv *env, jclass, jbyteArray jKey)
{
    // A null array is reported through the same path as a too-short key.
    const jsize len = (jKey != nullptr) ? env->GetArrayLength(jKey) : 0;
    if (len < 32) {
        LOGE("Create decryptor failed: key length=%d (<32)", static_cast<int>(len));
        diag_event("DEC create-failed key_len=%d\n", static_cast<int>(len));
        return 0;
    }

    // Copy the key to the stack rather than pinning the Java array: no
    // nullable pointer to handle, and the temporary copy can be wiped.
    uint8_t key[32];
    env->GetByteArrayRegion(jKey, 0, 32, reinterpret_cast<jbyte *>(key));
    const uint32_t key_fp = key_fingerprint32(key);
    auto *dec = new XChaCha20Decryptor(key);

    // Volatile writes so the wipe is not removed as a dead store.
    volatile uint8_t *wipe = key;
    for (size_t i = 0; i < sizeof(key); ++i) wipe[i] = 0;

    // The handle returned to Java owns this reference.
    dec->AddRef();
    LOGI("Created XChaCha20 decryptor %p key_fp=%08x", dec, key_fp);
    diag_event("DEC created ptr=%p key_fp=%08x key_len=%d\n", dec, key_fp, static_cast<int>(len));
    return reinterpret_cast<jlong>(dec);
}
/**
 * Logs the release and drops one reference on the decryptor behind `ptr`
 * (the counterpart of the AddRef done in nativeCreateDecryptor).
 * A zero handle is ignored.
 */
JNIEXPORT void JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeReleaseDecryptor(
        JNIEnv *, jclass, jlong ptr)
{
    if (ptr != 0) {
        auto *decryptor = reinterpret_cast<XChaCha20Decryptor *>(ptr);
        // Log before Release(): the object must not be touched afterwards.
        LOGI("Release XChaCha20 decryptor %p key_fp=%08x", decryptor, decryptor->KeyFingerprint());
        diag_event("DEC release ptr=%p key_fp=%08x\n", decryptor, decryptor->KeyFingerprint());
        decryptor->Release();
    }
}
/* ── Install native crash handler ─────────────────────────────── */
/**
 * Stores the crash-report path in g_crash_path and registers
 * native_crash_handler for SIGSEGV and SIGABRT, saving the previous
 * dispositions in g_old_sigsegv / g_old_sigabrt.
 *
 * Nothing is installed when the path is null or cannot be read. Calling
 * this again only refreshes the path for signals already handled here.
 *
 * @param jPath  file path for the crash report.
 */
JNIEXPORT void JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeInstallCrashHandler(
        JNIEnv *env, jclass, jstring jPath)
{
    // GetStringUTFChars returns nullptr on allocation failure; a null
    // jstring must not be passed to it at all.
    const char *path = (jPath != nullptr) ? env->GetStringUTFChars(jPath, nullptr) : nullptr;
    if (path == nullptr) {
        LOGE("Native crash handler not installed: crash path unavailable");
        return;
    }
    strncpy(g_crash_path, path, sizeof(g_crash_path) - 1);
    g_crash_path[sizeof(g_crash_path) - 1] = '\0';  // strncpy does not terminate on truncation
    env->ReleaseStringUTFChars(jPath, path);

    struct sigaction sa = {};
    sa.sa_sigaction = native_crash_handler;
    sa.sa_flags = SA_SIGINFO;
    sigemptyset(&sa.sa_mask);

    // Registering again while our handler is already active would store
    // our own handler in g_old_*, losing the real previous disposition
    // (presumably the crash handler chains to it — confirm). Skip then.
    auto install = [&sa](int signo, struct sigaction *saved) {
        struct sigaction current = {};
        const bool already_ours =
            sigaction(signo, nullptr, &current) == 0 &&
            (current.sa_flags & SA_SIGINFO) != 0 &&
            current.sa_sigaction == native_crash_handler;
        if (!already_ours) {
            sigaction(signo, &sa, saved);
        }
    };
    install(SIGSEGV, &g_old_sigsegv);
    install(SIGABRT, &g_old_sigabrt);
    LOGI("Native crash handler installed, path=%s", g_crash_path);
}
/* ── Open diagnostics file for E2EE frame logging ────────────── */
/**
 * (Re)opens the diagnostics file: closes any previously opened file, resets
 * the diagnostic event counter, then creates/truncates the file at jPath
 * and writes a header line with the process id.
 *
 * When the path is null or cannot be read, the previous file stays closed
 * and no new file is opened.
 *
 * NOTE(review): g_diag_fd is a plain int; if frames are being logged on
 * another thread while this runs, the close/reopen is unsynchronised — confirm
 * callers only invoke this while no call is active.
 *
 * @param jPath  destination path of the diagnostics file.
 */
JNIEXPORT void JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeOpenDiagFile(
        JNIEnv *env, jclass, jstring jPath)
{
    if (g_diag_fd >= 0) { close(g_diag_fd); g_diag_fd = -1; }
    g_diag_event_count.store(0, std::memory_order_relaxed);

    // GetStringUTFChars returns nullptr on allocation failure; a null
    // jstring must not be passed to it at all.
    const char *path = (jPath != nullptr) ? env->GetStringUTFChars(jPath, nullptr) : nullptr;
    if (path == nullptr) {
        LOGE("Failed to open diag file: path unavailable");
        return;
    }
    strncpy(g_diag_path, path, sizeof(g_diag_path) - 1);
    g_diag_path[sizeof(g_diag_path) - 1] = '\0';  // strncpy does not terminate on truncation
    g_diag_fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    env->ReleaseStringUTFChars(jPath, path);

    if (g_diag_fd >= 0) {
        diag_write("=== E2EE DIAGNOSTICS pid=%d ===\n", static_cast<int>(getpid()));
        LOGI("Diag file opened: %s", g_diag_path);
        diag_event("DIAG open path=%s\n", g_diag_path);
    } else {
        LOGE("Failed to open diag file: %s", g_diag_path);
    }
}
/**
 * Closes the diagnostics file if one is open, after writing a closing event
 * and an end marker. Does nothing when no file is open.
 */
JNIEXPORT void JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeCloseDiagFile(
        JNIEnv *, jclass)
{
    const bool diag_open = (g_diag_fd >= 0);
    if (!diag_open) {
        return;
    }
    // Trailer lines go out while the descriptor is still valid.
    diag_event("DIAG close path=%s\n", g_diag_path);
    diag_write("=== END ===\n");
    close(g_diag_fd);
    g_diag_fd = -1;
}
/* ── Query frame counts for health checks ────────────────────── */
/**
 * Health-check query: returns FrameCount() of the encryptor behind `ptr`,
 * or -1 for a zero handle.
 */
JNIEXPORT jint JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeGetEncryptorFrameCount(
        JNIEnv *, jclass, jlong ptr)
{
    if (ptr != 0) {
        auto *encryptor = reinterpret_cast<XChaCha20Encryptor *>(ptr);
        return encryptor->FrameCount();
    }
    return -1;
}
/**
 * Health-check query: returns FrameCount() of the decryptor behind `ptr`,
 * or -1 for a zero handle.
 */
JNIEXPORT jint JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeGetDecryptorFrameCount(
        JNIEnv *, jclass, jlong ptr)
{
    if (ptr != 0) {
        auto *decryptor = reinterpret_cast<XChaCha20Decryptor *>(ptr);
        return decryptor->FrameCount();
    }
    return -1;
}
/**
 * Health-check query: returns BadStreak() of the decryptor behind `ptr`,
 * or -1 for a zero handle.
 */
JNIEXPORT jint JNICALL
Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeGetDecryptorBadStreak(
        JNIEnv *, jclass, jlong ptr)
{
    if (ptr != 0) {
        auto *decryptor = reinterpret_cast<XChaCha20Decryptor *>(ptr);
        return decryptor->BadStreak();
    }
    return -1;
}
} /* extern "C" */