/** * JNI bridge for Rosetta E2EE. * * Provides: * 1. HSalsa20 — for nacl.box.before() compatible key derivation * 2. XChaCha20 FrameEncryptor / FrameDecryptor — inherits DIRECTLY from * webrtc::FrameEncryptorInterface / FrameDecryptorInterface so the * vtable is generated by the compiler, not guessed by us. */ #include #include #include #include #include #include #include #include #include #include #include #include /* WebRTC M125 interface stubs (exact copies of the real headers) */ #include "webrtc/api/crypto/frame_encryptor_interface.h" #include "webrtc/api/crypto/frame_decryptor_interface.h" #include "crypto.h" #define TAG "RosettaE2EE" #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__) /* ── Diagnostics file — written from native, visible in crash reports ── */ static char g_diag_path[512] = {0}; static int g_diag_fd = -1; static std::atomic g_diag_event_count{0}; static constexpr int kDiagEventLimit = 4000; static constexpr int kDiagFrameLimit = 400; static constexpr size_t kFrameHashSampleBytes = 320; static void diag_write(const char *fmt, ...) { if (g_diag_fd < 0) return; char buf[512]; va_list ap; va_start(ap, fmt); int n = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); if (n > 0) write(g_diag_fd, buf, n); } static void diag_event(const char *fmt, ...) { if (g_diag_fd < 0) return; const int idx = g_diag_event_count.fetch_add(1, std::memory_order_relaxed); if (idx >= kDiagEventLimit) return; char buf[640]; va_list ap; va_start(ap, fmt); int n = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); if (n > 0) write(g_diag_fd, buf, n); } static inline uint32_t fnv1a32(const uint8_t* data, size_t len, size_t sample_limit = 0) { if (!data || len == 0) return 0; const size_t n = (sample_limit > 0 && len > sample_limit) ? sample_limit : len; uint32_t h = 2166136261u; for (size_t i = 0; i < n; ++i) { h ^= data[i]; h *= 16777619u; } return h; } static inline uint32_t key_fingerprint32(const uint8_t key[32]) { return fnv1a32(key, 32, 32); } static inline uint32_t nonce_ts32(const uint8_t nonce[24]) { return ((uint32_t)nonce[4] << 24) | ((uint32_t)nonce[5] << 16) | ((uint32_t)nonce[6] << 8) | ((uint32_t)nonce[7]); } static const char* media_type_name(cricket::MediaType media_type) { switch (media_type) { case cricket::MEDIA_TYPE_AUDIO: return "audio"; case cricket::MEDIA_TYPE_VIDEO: return "video"; case cricket::MEDIA_TYPE_DATA: return "data"; case cricket::MEDIA_TYPE_UNSUPPORTED: return "unsupported"; default: return "unknown"; } } /* ── RTP helpers (for cases when additional_data is empty) ───── */ struct ParsedRtpPacket { size_t header_size = 0; uint16_t sequence = 0; uint32_t timestamp = 0; uint32_t ssrc = 0; }; struct RtpProbeState { bool locked = false; bool has_probe = false; uint32_t probe_ssrc = 0; uint16_t probe_sequence = 0; uint32_t probe_timestamp = 0; uint32_t ssrc = 0; uint16_t last_sequence = 0; uint32_t last_timestamp = 0; }; struct GeneratedTsState { bool initialized = false; uint32_t next_timestamp = 0; uint32_t next_step = 960; // 20 ms @ 48 kHz (default Opus packetization) }; struct AdditionalTsState { bool initialized = false; uint64_t base_timestamp = 0; }; struct SenderTsOffsetState { bool initialized = false; bool enabled = false; uint64_t offset = 0; }; static inline uint16_t load16_be(const uint8_t* p) { return (uint16_t)(((uint16_t)p[0] << 8) | (uint16_t)p[1]); } static inline uint32_t load32_be(const uint8_t* p) { return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) | ((uint32_t)p[2] << 8) | ((uint32_t)p[3]); } static inline uint64_t load64_be(const uint8_t* p) { return ((uint64_t)p[0] << 56) | ((uint64_t)p[1] << 48) | ((uint64_t)p[2] << 40) | ((uint64_t)p[3] << 32) | ((uint64_t)p[4] << 24) | ((uint64_t)p[5] << 16) | ((uint64_t)p[6] << 8) | ((uint64_t)p[7]); } static bool parse_rtp_packet(const uint8_t* data, size_t len, ParsedRtpPacket* out) { if (!data || !out || len < 12) return false; // RTP version must be 2. const uint8_t version = (data[0] >> 6) & 0x03; if (version != 2) return false; const size_t csrc_count = (size_t)(data[0] & 0x0F); const bool has_extension = (data[0] & 0x10) != 0; size_t header = 12 + csrc_count * 4; if (header > len) return false; if (has_extension) { // Extension header: 16-bit profile + 16-bit length (in 32-bit words) if (len < header + 4) return false; const uint16_t ext_len_words = (uint16_t)(((uint16_t)data[header + 2] << 8) | (uint16_t)data[header + 3]); const size_t ext_bytes = (size_t)ext_len_words * 4; header += 4 + ext_bytes; if (header > len) return false; } const size_t payload_size = len - header; if (payload_size == 0 || payload_size > 1200) return false; out->header_size = header; out->sequence = load16_be(data + 2); out->timestamp = load32_be(data + 4); out->ssrc = load32_be(data + 8); return true; } static bool fill_nonce_from_rtp_frame(const uint8_t* data, size_t len, RtpProbeState* state, uint8_t nonce[24], size_t* header_size) { if (!state) return false; ParsedRtpPacket packet; if (!parse_rtp_packet(data, len, &packet)) return false; if (!state->locked) { if (!state->has_probe) { state->has_probe = true; state->probe_ssrc = packet.ssrc; state->probe_sequence = packet.sequence; state->probe_timestamp = packet.timestamp; diag_event("RTP probe-start ssrc=%u seq=%u ts=%u hdr=%zu\n", packet.ssrc, packet.sequence, packet.timestamp, packet.header_size); return false; } const bool same_ssrc = packet.ssrc == state->probe_ssrc; const uint16_t seq_delta = (uint16_t)(packet.sequence - state->probe_sequence); const bool sequence_progressed = seq_delta > 0 && seq_delta <= 10; if (!same_ssrc || !sequence_progressed) { state->probe_ssrc = packet.ssrc; state->probe_sequence = packet.sequence; state->probe_timestamp = packet.timestamp; return false; } state->locked = true; state->has_probe = false; state->ssrc = packet.ssrc; state->last_sequence = packet.sequence; state->last_timestamp = packet.timestamp; diag_event("RTP probe-lock ssrc=%u seq=%u ts=%u hdr=%zu\n", packet.ssrc, packet.sequence, packet.timestamp, packet.header_size); } else { if (packet.ssrc != state->ssrc) { diag_event("RTP probe-unlock reason=ssrc-change old=%u new=%u seq=%u ts=%u\n", state->ssrc, packet.ssrc, packet.sequence, packet.timestamp); state->locked = false; state->has_probe = true; state->probe_ssrc = packet.ssrc; state->probe_sequence = packet.sequence; state->probe_timestamp = packet.timestamp; return false; } const uint16_t seq_delta = (uint16_t)(packet.sequence - state->last_sequence); // Accept in-order packets and small jumps (packet loss). if (seq_delta != 0 && seq_delta <= 200) { state->last_sequence = packet.sequence; state->last_timestamp = packet.timestamp; } else if (seq_delta != 0) { // Not plausible for a continuous stream: re-probe. diag_event("RTP probe-unlock reason=seq-jump delta=%u ssrc=%u last_seq=%u seq=%u ts=%u\n", seq_delta, packet.ssrc, state->last_sequence, packet.sequence, packet.timestamp); state->locked = false; state->has_probe = true; state->probe_ssrc = packet.ssrc; state->probe_sequence = packet.sequence; state->probe_timestamp = packet.timestamp; return false; } } nonce[4] = (uint8_t)(packet.timestamp >> 24); nonce[5] = (uint8_t)(packet.timestamp >> 16); nonce[6] = (uint8_t)(packet.timestamp >> 8); nonce[7] = (uint8_t)(packet.timestamp); if (header_size) *header_size = packet.header_size; return true; } static bool fill_nonce_from_additional_data(const uint8_t* data, size_t len, uint8_t nonce[24], bool* used_rtp_header, AdditionalTsState* ts_state, bool* used_relative_ts) { if (used_rtp_header) *used_rtp_header = false; if (used_relative_ts) *used_relative_ts = false; if (!data || len < 8) return false; // Desktop-compatible path: additional_data contains encoded frame timestamp // as 8-byte BE value. On Android sender/receiver can have different absolute // base in some pipelines; ts_state enables optional relative fallback. // Primary desktop-compatible mode uses absolute timestamp (ts_state == nullptr). if (len == 8) { uint64_t ts64 = load64_be(data); uint64_t ts_rel = ts64; if (ts_state != nullptr) { if (!ts_state->initialized) { ts_state->initialized = true; ts_state->base_timestamp = ts64; } else if (ts64 < ts_state->base_timestamp) { // New stream or reset; avoid huge wrapped nonce drift. ts_state->base_timestamp = ts64; } ts_rel = ts64 - ts_state->base_timestamp; if (used_relative_ts) *used_relative_ts = true; } nonce[0] = (uint8_t)(ts_rel >> 56); nonce[1] = (uint8_t)(ts_rel >> 48); nonce[2] = (uint8_t)(ts_rel >> 40); nonce[3] = (uint8_t)(ts_rel >> 32); nonce[4] = (uint8_t)(ts_rel >> 24); nonce[5] = (uint8_t)(ts_rel >> 16); nonce[6] = (uint8_t)(ts_rel >> 8); nonce[7] = (uint8_t)(ts_rel); return true; } // Legacy native WebRTC layout: additional_data can be RTP header bytes. if (len >= 12) { const uint8_t version = (data[0] >> 6) & 0x03; if (version == 2) { uint32_t ts = load32_be(data + 4); nonce[4] = (uint8_t)(ts >> 24); nonce[5] = (uint8_t)(ts >> 16); nonce[6] = (uint8_t)(ts >> 8); nonce[7] = (uint8_t)(ts); if (used_rtp_header) *used_rtp_header = true; return true; } } // Generic fallback: first 8 bytes as BE timestamp-like payload. memcpy(nonce, data, 8); return true; } static bool is_plausible_opus_packet(const uint8_t* packet, size_t len); static bool is_plausible_decrypted_audio_frame(const uint8_t* data, size_t len) { if (!data || len == 0) return false; ParsedRtpPacket packet; if (parse_rtp_packet(data, len, &packet) && packet.header_size < len) { return is_plausible_opus_packet(data + packet.header_size, len - packet.header_size); } return is_plausible_opus_packet(data, len); } static inline void fill_nonce_from_ts32(uint32_t ts, uint8_t nonce[24]) { nonce[4] = (uint8_t)(ts >> 24); nonce[5] = (uint8_t)(ts >> 16); nonce[6] = (uint8_t)(ts >> 8); nonce[7] = (uint8_t)(ts); } static inline void fill_nonce_from_ts64(uint64_t ts, uint8_t nonce[24]) { nonce[0] = (uint8_t)(ts >> 56); nonce[1] = (uint8_t)(ts >> 48); nonce[2] = (uint8_t)(ts >> 40); nonce[3] = (uint8_t)(ts >> 32); nonce[4] = (uint8_t)(ts >> 24); nonce[5] = (uint8_t)(ts >> 16); nonce[6] = (uint8_t)(ts >> 8); nonce[7] = (uint8_t)(ts); } static inline uint64_t monotonic_48k_ticks() { struct timespec ts {}; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) return 0; const uint64_t sec = (uint64_t)ts.tv_sec; const uint64_t nsec = (uint64_t)ts.tv_nsec; return sec * 48000ULL + (nsec * 48000ULL) / 1000000000ULL; } static inline uint32_t opus_base_frame_samples(uint8_t config) { // RFC 6716 TOC config mapping at 48 kHz. if (config <= 11) { // SILK: 10/20/40/60 ms static const uint32_t kSilk[4] = {480, 960, 1920, 2880}; return kSilk[config & 0x03]; } if (config <= 15) { // Hybrid: 10/20 ms return (config & 0x01) ? 960 : 480; } // CELT-only: 2.5/5/10/20 ms static const uint32_t kCelt[4] = {120, 240, 480, 960}; return kCelt[config & 0x03]; } static uint32_t infer_opus_packet_duration_samples(const uint8_t* packet, size_t len) { if (!packet || len == 0) return 960; const uint8_t toc = packet[0]; const uint8_t config = (uint8_t)(toc >> 3); const uint8_t frame_code = (uint8_t)(toc & 0x03); uint32_t frame_count = 1; if (frame_code == 1 || frame_code == 2) { frame_count = 2; } else if (frame_code == 3) { if (len < 2) return 960; frame_count = (uint32_t)(packet[1] & 0x3F); if (frame_count == 0 || frame_count > 48) return 960; } uint32_t base = opus_base_frame_samples(config); uint32_t total = base * frame_count; if (total < 120 || total > 5760) return 960; return total; } static bool decode_opus_frame_len(const uint8_t* packet, size_t len, size_t* offset, size_t* frame_len) { if (!packet || !offset || !frame_len) return false; if (*offset >= len) return false; const uint8_t b0 = packet[*offset]; (*offset)++; if (b0 < 252) { *frame_len = b0; return true; } if (*offset >= len) return false; const uint8_t b1 = packet[*offset]; (*offset)++; *frame_len = (size_t)b1 * 4u + (size_t)b0; return true; } static bool is_plausible_opus_packet(const uint8_t* packet, size_t len) { if (!packet || len == 0 || len > 2000) return false; const uint8_t toc = packet[0]; const uint8_t config = (uint8_t)(toc >> 3); if (config > 31) return false; const uint8_t frame_code = (uint8_t)(toc & 0x03); const size_t payload_len = len - 1; if (frame_code == 0) { // 1 frame, full payload return payload_len >= 1; } if (frame_code == 1) { // 2 CBR frames, equal sizes. if (payload_len < 2) return false; return (payload_len % 2) == 0; } if (frame_code == 2) { // 2 VBR frames size_t off = 1; size_t len1 = 0; if (!decode_opus_frame_len(packet, len, &off, &len1)) return false; if (len1 == 0) return false; if (off + len1 >= len) return false; // need non-empty second frame const size_t len2 = len - off - len1; if (len2 == 0) return false; return true; } // frame_code == 3: arbitrary number of frames if (len < 2) return false; const uint8_t ch = packet[1]; const bool vbr = (ch & 0x80) != 0; const bool has_padding = (ch & 0x40) != 0; const uint8_t frame_count = (uint8_t)(ch & 0x3F); if (frame_count == 0 || frame_count > 48) return false; const uint32_t total = opus_base_frame_samples(config) * (uint32_t)frame_count; if (total > 5760) return false; // Padding bit is rarely used in live voice. Rejecting it improves // discrimination between valid Opus and random decrypted noise. if (has_padding) return false; if (!vbr) { const size_t data_len = len - 2; if (data_len < (size_t)frame_count) return false; return (data_len % frame_count) == 0; } // VBR: parse lengths for first N-1 frames; last frame consumes rest. size_t off = 2; size_t consumed = 0; for (size_t i = 0; i + 1 < frame_count; ++i) { size_t flen = 0; if (!decode_opus_frame_len(packet, len, &off, &flen)) return false; if (flen == 0) return false; consumed += flen; if (off + consumed >= len) return false; } const size_t remaining = len - off; if (remaining <= consumed) return false; const size_t last = remaining - consumed; return last > 0; } /* ── Native crash handler — writes to file before dying ──────── */ static char g_crash_path[512] = {0}; static struct sigaction g_old_sigsegv = {}; static struct sigaction g_old_sigabrt = {}; static void native_crash_handler(int sig, siginfo_t *info, void *ctx) { if (g_crash_path[0] != 0) { int fd = open(g_crash_path, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd >= 0) { const char *msg; if (sig == SIGSEGV) msg = "NATIVE CRASH: SIGSEGV (segmentation fault) in rosetta_e2ee\n" "Fault address: see logcat for details.\n"; else if (sig == SIGABRT) msg = "NATIVE CRASH: SIGABRT (abort) in rosetta_e2ee\n"; else msg = "NATIVE CRASH: unknown signal in rosetta_e2ee\n"; write(fd, msg, strlen(msg)); close(fd); } } LOGE("NATIVE CRASH sig=%d addr=%p", sig, info ? info->si_addr : nullptr); struct sigaction *old = (sig == SIGSEGV) ? &g_old_sigsegv : &g_old_sigabrt; sigaction(sig, old, nullptr); raise(sig); } /* ════════════════════════════════════════════════════════════════ * XChaCha20 Encryptor — inherits from the REAL interface * ════════════════════════════════════════════════════════════════ */ class XChaCha20Encryptor final : public webrtc::FrameEncryptorInterface { public: explicit XChaCha20Encryptor(const uint8_t key[32]) { memcpy(key_, key, 32); key_fingerprint_ = key_fingerprint32(key_); LOGI("ENC init ptr=%p key_fp=%08x", this, key_fingerprint_); diag_event("ENC init ptr=%p key_fp=%08x\n", this, key_fingerprint_); } uint32_t KeyFingerprint() const { return key_fingerprint_; } int FrameCount() const { return diag_count_.load(std::memory_order_relaxed); } /* ── RefCountInterface ─────────────────────────────────────── */ void AddRef() const override { ref_.fetch_add(1, std::memory_order_relaxed); } rtc::RefCountReleaseStatus Release() const override { if (ref_.fetch_sub(1, std::memory_order_acq_rel) == 1) { delete this; return rtc::RefCountReleaseStatus::kDroppedLastRef; } return rtc::RefCountReleaseStatus::kOtherRefsRemained; } /** * Desktop-compatible frame format: ciphertext only (no custom prefix). * * Nonce (24 bytes) is derived exactly like desktop: * - nonce[0..3] = 0 * - nonce[4..7] = RTP timestamp * - nonce[8..23] = 0 * * Primary source of timestamp: additional_data[4..7] (if provided by WebRTC). * Fallback (Android path where additional_data can be empty): * parse RTP header from frame and take timestamp from frame[4..7]. * * If RTP header is found inside frame, we leave header bytes unencrypted * and encrypt only payload (desktop-compatible). */ int Encrypt(cricket::MediaType media_type, uint32_t ssrc, rtc::ArrayView additional_data, rtc::ArrayView frame, rtc::ArrayView encrypted_frame, size_t* bytes_written) override { if (frame.size() == 0 || encrypted_frame.size() < frame.size()) { *bytes_written = 0; return -1; } size_t header_size = 0; bool nonce_from_rtp_header = false; bool nonce_from_generated_ts = false; bool nonce_from_additional_data = false; bool additional_was_rtp_header = false; bool additional_used_mono_offset = false; uint32_t generated_ts_used = 0; // Build nonce from RTP timestamp in additional_data (preferred). uint8_t nonce[24] = {0}; bool additional_used_relative_ts = false; nonce_from_additional_data = fill_nonce_from_additional_data( additional_data.data(), additional_data.size(), nonce, &additional_was_rtp_header, nullptr, &additional_used_relative_ts); if (!nonce_from_additional_data) { nonce_from_rtp_header = fill_nonce_from_rtp_frame(frame.data(), frame.size(), &rtp_probe_, nonce, &header_size); if (!nonce_from_rtp_header) { if (!generated_ts_.initialized) { generated_ts_.initialized = true; generated_ts_.next_timestamp = 0; generated_ts_.next_step = 960; } nonce_from_generated_ts = true; generated_ts_used = generated_ts_.next_timestamp; fill_nonce_from_ts32(generated_ts_used, nonce); diag_event("ENC fallback=generated-ts mt=%s ssrc=%u frame_sz=%zu ad_sz=%zu gen_ts=%u\n", media_type_name(media_type), ssrc, frame.size(), additional_data.size(), generated_ts_used); } } // Some Android sender pipelines expose stream-relative ad8 timestamps // (0, 960, 1920, ...), while desktop receiver expects an absolute base. // For interop, add a monotonic 48k offset once when first ad8 is tiny. if (nonce_from_additional_data && additional_data.size() == 8 && !additional_was_rtp_header && additional_data.data() != nullptr) { const uint64_t ad_ts64 = load64_be(additional_data.data()); if (!sender_ts_offset_.initialized) { sender_ts_offset_.initialized = true; // Keep pure raw-abs mode by default; desktop is the source of truth. sender_ts_offset_.enabled = false; sender_ts_offset_.offset = 0ULL; diag_event("ENC ad8-base init ssrc=%u ad_ts=%llu use_mono=%d mono_off=%llu\n", ssrc, (unsigned long long)ad_ts64, sender_ts_offset_.enabled ? 1 : 0, (unsigned long long)sender_ts_offset_.offset); } if (sender_ts_offset_.enabled) { const uint64_t ts_adj = ad_ts64 + sender_ts_offset_.offset; fill_nonce_from_ts64(ts_adj, nonce); additional_used_mono_offset = true; } } // Desktop createEncodedStreams encrypts full encoded chunk. // To stay wire-compatible, do not preserve any leading RTP-like bytes. rosetta_xchacha20_xor(encrypted_frame.data(), frame.data(), frame.size(), nonce, key_); *bytes_written = frame.size(); if (nonce_from_generated_ts) { const uint32_t step = infer_opus_packet_duration_samples(frame.data(), frame.size()); generated_ts_.next_step = step; generated_ts_.next_timestamp = generated_ts_used + step; } // Diag: log first frames with enough context for crash analysis. int n = diag_count_.fetch_add(1, std::memory_order_relaxed); if (n < kDiagFrameLimit) { uint8_t ad_prefix[8] = {0}; const size_t ad_copy = additional_data.size() < sizeof(ad_prefix) ? additional_data.size() : sizeof(ad_prefix); if (ad_copy > 0) memcpy(ad_prefix, additional_data.data(), ad_copy); ParsedRtpPacket rtp{}; const bool has_rtp = parse_rtp_packet(frame.data(), frame.size(), &rtp); const bool opus_plausible = has_rtp && rtp.header_size < frame.size() ? is_plausible_opus_packet(frame.data() + rtp.header_size, frame.size() - rtp.header_size) : is_plausible_opus_packet(frame.data(), frame.size()); const uint32_t in_hash = fnv1a32(frame.data(), frame.size(), kFrameHashSampleBytes); const uint32_t out_hash = fnv1a32(encrypted_frame.data(), frame.size(), kFrameHashSampleBytes); const char* mode = nonce_from_rtp_header ? "rtp" : (nonce_from_generated_ts ? "gen" : (nonce_from_additional_data ? (additional_was_rtp_header ? "ad-rtp" : (additional_used_mono_offset ? "raw-abs+mono" : (additional_used_relative_ts ? "raw-rel" : "raw-abs"))) : "raw-abs")); LOGI("ENC frame#%d mt=%s ssrc=%u sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u rtp_ok=%d rtp_seq=%u rtp_ts=%u rtp_ssrc=%u opus_ok=%d key_fp=%08x in_h=%08x out_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x", n, media_type_name(media_type), ssrc, frame.size(), additional_data.size(), header_size, mode, nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step, has_rtp ? 1 : 0, has_rtp ? rtp.sequence : 0, has_rtp ? rtp.timestamp : 0, has_rtp ? rtp.ssrc : 0, opus_plausible ? 1 : 0, key_fingerprint_, in_hash, out_hash, ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3], ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]); diag_write("ENC frame#%d mt=%s ssrc=%u sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u rtp_ok=%d rtp_seq=%u rtp_ts=%u rtp_ssrc=%u opus_ok=%d key_fp=%08x in_h=%08x out_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x\n", n, media_type_name(media_type), ssrc, frame.size(), additional_data.size(), header_size, mode, nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step, has_rtp ? 1 : 0, has_rtp ? rtp.sequence : 0, has_rtp ? rtp.timestamp : 0, has_rtp ? rtp.ssrc : 0, opus_plausible ? 1 : 0, key_fingerprint_, in_hash, out_hash, ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3], ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]); } return 0; } size_t GetMaxCiphertextByteSize(cricket::MediaType, size_t frame_size) override { return frame_size; } protected: ~XChaCha20Encryptor() override { diag_event("ENC destroy ptr=%p key_fp=%08x\n", this, key_fingerprint_); memset(key_, 0, 32); } private: mutable std::atomic ref_{0}; mutable std::atomic diag_count_{0}; mutable RtpProbeState rtp_probe_; mutable GeneratedTsState generated_ts_; mutable SenderTsOffsetState sender_ts_offset_; uint32_t key_fingerprint_ = 0; uint8_t key_[32]; }; /* ════════════════════════════════════════════════════════════════ * XChaCha20 Decryptor — inherits from the REAL interface * ════════════════════════════════════════════════════════════════ */ class XChaCha20Decryptor final : public webrtc::FrameDecryptorInterface { public: explicit XChaCha20Decryptor(const uint8_t key[32]) { memcpy(key_, key, 32); key_fingerprint_ = key_fingerprint32(key_); LOGI("DEC init ptr=%p key_fp=%08x", this, key_fingerprint_); diag_event("DEC init ptr=%p key_fp=%08x\n", this, key_fingerprint_); } uint32_t KeyFingerprint() const { return key_fingerprint_; } int FrameCount() const { return diag_count_.load(std::memory_order_relaxed); } uint32_t BadStreak() const { return bad_audio_streak_.load(std::memory_order_relaxed); } /* ── RefCountInterface ─────────────────────────────────────── */ void AddRef() const override { ref_.fetch_add(1, std::memory_order_relaxed); } rtc::RefCountReleaseStatus Release() const override { if (ref_.fetch_sub(1, std::memory_order_acq_rel) == 1) { delete this; return rtc::RefCountReleaseStatus::kDroppedLastRef; } return rtc::RefCountReleaseStatus::kOtherRefsRemained; } /** * Desktop-compatible decrypt: * - nonce from RTP timestamp * - if RTP header is present inside encrypted_frame (fallback path), * keep header bytes untouched and decrypt payload only. */ Result Decrypt(cricket::MediaType media_type, const std::vector& csrcs, rtc::ArrayView additional_data, rtc::ArrayView encrypted_frame, rtc::ArrayView frame) override { uint8_t nonce[24] = {0}; size_t header_size = 0; bool nonce_from_rtp_header = false; bool nonce_from_generated_ts = false; bool nonce_from_additional_data = false; bool additional_was_rtp_header = false; bool additional_used_relative_ts = false; bool used_additional_relative_fallback = false; bool used_plain_passthrough = false; uint32_t generated_ts_used = 0; nonce_from_additional_data = fill_nonce_from_additional_data( additional_data.data(), additional_data.size(), nonce, &additional_was_rtp_header, nullptr, &additional_used_relative_ts); if (!nonce_from_additional_data) { nonce_from_rtp_header = fill_nonce_from_rtp_frame(encrypted_frame.data(), encrypted_frame.size(), &rtp_probe_, nonce, &header_size); if (!nonce_from_rtp_header) { if (!generated_ts_.initialized) { generated_ts_.initialized = true; generated_ts_.next_timestamp = 0; generated_ts_.next_step = 960; } nonce_from_generated_ts = true; generated_ts_used = generated_ts_.next_timestamp; fill_nonce_from_ts32(generated_ts_used, nonce); diag_event("DEC fallback=generated-ts mt=%s csrcs=%zu enc_sz=%zu ad_sz=%zu gen_ts=%u\n", media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size(), generated_ts_used); } } if (encrypted_frame.size() == 0 || frame.size() < encrypted_frame.size()) { return {Result::Status::kFailedToDecrypt, 0}; } bool used_generated_resync = false; // Desktop createEncodedStreams decrypts full encoded chunk. rosetta_xchacha20_xor(frame.data(), encrypted_frame.data(), encrypted_frame.size(), nonce, key_); if (nonce_from_additional_data) { bool plausible = is_plausible_decrypted_audio_frame(frame.data(), encrypted_frame.size()); // Fallback for Android pipelines where additional_data timestamps are // stream-relative while remote side uses a different absolute base. if (!plausible && additional_data.size() == 8 && !additional_was_rtp_header) { uint8_t nonce_rel[24] = {0}; bool rel_used = false; if (fill_nonce_from_additional_data( additional_data.data(), additional_data.size(), nonce_rel, nullptr, &additional_rel_ts_state_, &rel_used) && rel_used) { std::vector candidate(encrypted_frame.size()); rosetta_xchacha20_xor( candidate.data(), encrypted_frame.data(), encrypted_frame.size(), nonce_rel, key_); if (is_plausible_decrypted_audio_frame(candidate.data(), candidate.size())) { memcpy(frame.data(), candidate.data(), candidate.size()); memcpy(nonce, nonce_rel, sizeof(nonce)); plausible = true; used_additional_relative_fallback = true; additional_used_relative_ts = true; diag_event("DEC fallback=relative-ad-ts mt=%s csrcs=%zu enc_sz=%zu ad_sz=%zu nonce_ts=%u\n", media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size(), nonce_ts32(nonce)); } } } // If payload already looks like valid Opus, keep plaintext. // This protects interop when peer stream is unexpectedly unencrypted. if (!plausible && is_plausible_decrypted_audio_frame(encrypted_frame.data(), encrypted_frame.size())) { memcpy(frame.data(), encrypted_frame.data(), encrypted_frame.size()); used_plain_passthrough = true; diag_event("DEC fallback=plain-passthrough mt=%s csrcs=%zu enc_sz=%zu ad_sz=%zu\n", media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size()); } } if (nonce_from_generated_ts) { bool plausible = is_plausible_opus_packet(frame.data(), encrypted_frame.size()); // Recover after lost packets by probing a few forward timestamp steps. if (!plausible) { std::vector candidate(encrypted_frame.size()); for (uint32_t i = 1; i <= 8; ++i) { const uint32_t ts_try = generated_ts_used + generated_ts_.next_step * i; uint8_t nonce_try[24] = {0}; fill_nonce_from_ts32(ts_try, nonce_try); rosetta_xchacha20_xor( candidate.data(), encrypted_frame.data(), encrypted_frame.size(), nonce_try, key_); if (is_plausible_opus_packet(candidate.data(), candidate.size())) { memcpy(frame.data(), candidate.data(), candidate.size()); generated_ts_used = ts_try; used_generated_resync = true; plausible = true; diag_event("DEC fallback=generated-resync mt=%s csrcs=%zu enc_sz=%zu ts_try=%u step=%u probe=%u\n", media_type_name(media_type), csrcs.size(), encrypted_frame.size(), ts_try, generated_ts_.next_step, i); break; } } } const uint32_t step = infer_opus_packet_duration_samples(frame.data(), encrypted_frame.size()); generated_ts_.next_step = step; generated_ts_.next_timestamp = generated_ts_used + step; } // Diag: log first frames with enough context for crash analysis. int n = diag_count_.fetch_add(1, std::memory_order_relaxed); if (n < kDiagFrameLimit) { uint8_t ad_prefix[8] = {0}; const size_t ad_copy = additional_data.size() < sizeof(ad_prefix) ? additional_data.size() : sizeof(ad_prefix); if (ad_copy > 0) memcpy(ad_prefix, additional_data.data(), ad_copy); ParsedRtpPacket enc_rtp{}; ParsedRtpPacket dec_rtp{}; const bool has_enc_rtp = parse_rtp_packet(encrypted_frame.data(), encrypted_frame.size(), &enc_rtp); const bool has_dec_rtp = parse_rtp_packet(frame.data(), encrypted_frame.size(), &dec_rtp); const bool dec_plausible = is_plausible_decrypted_audio_frame(frame.data(), encrypted_frame.size()); const uint32_t enc_hash = fnv1a32(encrypted_frame.data(), encrypted_frame.size(), kFrameHashSampleBytes); const uint32_t dec_hash = fnv1a32(frame.data(), encrypted_frame.size(), kFrameHashSampleBytes); const char* mode = nullptr; if (nonce_from_rtp_header) { mode = "rtp"; } else if (nonce_from_generated_ts) { mode = used_generated_resync ? "gen-resync" : "gen"; } else if (nonce_from_additional_data) { if (used_plain_passthrough) mode = "raw-plain"; else if (additional_was_rtp_header) mode = "ad-rtp"; else if (used_additional_relative_fallback) mode = "raw-rel-fb"; else mode = additional_used_relative_ts ? "raw-rel" : "raw-abs"; } else { mode = "raw-abs"; } uint32_t bad_streak = 0; if (!dec_plausible) { bad_streak = bad_audio_streak_.fetch_add(1, std::memory_order_relaxed) + 1; if (bad_streak == 1 || bad_streak == 3 || bad_streak == 10 || (bad_streak % 50) == 0) { diag_event("DEC degraded mt=%s csrcs=%zu mode=%s bad_streak=%u nonce_ts=%u key_fp=%08x\n", media_type_name(media_type), csrcs.size(), mode, bad_streak, nonce_ts32(nonce), key_fingerprint_); } } else { const uint32_t prev_bad = bad_audio_streak_.exchange(0, std::memory_order_relaxed); if (prev_bad >= 3) { diag_event("DEC recovered mt=%s csrcs=%zu mode=%s prev_bad_streak=%u nonce_ts=%u key_fp=%08x\n", media_type_name(media_type), csrcs.size(), mode, prev_bad, nonce_ts32(nonce), key_fingerprint_); } } LOGI("DEC frame#%d mt=%s csrcs=%zu enc_sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u dec_ok=%d bad_streak=%u enc_rtp=%d enc_seq=%u enc_ts=%u enc_ssrc=%u dec_rtp=%d dec_seq=%u dec_ts=%u dec_ssrc=%u key_fp=%08x enc_h=%08x dec_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x", n, media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size(), header_size, mode, nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step, dec_plausible ? 1 : 0, bad_streak, has_enc_rtp ? 1 : 0, has_enc_rtp ? enc_rtp.sequence : 0, has_enc_rtp ? enc_rtp.timestamp : 0, has_enc_rtp ? enc_rtp.ssrc : 0, has_dec_rtp ? 1 : 0, has_dec_rtp ? dec_rtp.sequence : 0, has_dec_rtp ? dec_rtp.timestamp : 0, has_dec_rtp ? dec_rtp.ssrc : 0, key_fingerprint_, enc_hash, dec_hash, ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3], ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]); diag_write("DEC frame#%d mt=%s csrcs=%zu enc_sz=%zu ad=%zu hdr=%zu mode=%s nonce_ts=%u gen_ts=%u next_step=%u dec_ok=%d bad_streak=%u enc_rtp=%d enc_seq=%u enc_ts=%u enc_ssrc=%u dec_rtp=%d dec_seq=%u dec_ts=%u dec_ssrc=%u key_fp=%08x enc_h=%08x dec_h=%08x ad8=%02x%02x%02x%02x%02x%02x%02x%02x\n", n, media_type_name(media_type), csrcs.size(), encrypted_frame.size(), additional_data.size(), header_size, mode, nonce_ts32(nonce), generated_ts_used, generated_ts_.next_step, dec_plausible ? 1 : 0, bad_streak, has_enc_rtp ? 1 : 0, has_enc_rtp ? enc_rtp.sequence : 0, has_enc_rtp ? enc_rtp.timestamp : 0, has_enc_rtp ? enc_rtp.ssrc : 0, has_dec_rtp ? 1 : 0, has_dec_rtp ? dec_rtp.sequence : 0, has_dec_rtp ? dec_rtp.timestamp : 0, has_dec_rtp ? dec_rtp.ssrc : 0, key_fingerprint_, enc_hash, dec_hash, ad_prefix[0], ad_prefix[1], ad_prefix[2], ad_prefix[3], ad_prefix[4], ad_prefix[5], ad_prefix[6], ad_prefix[7]); } return {Result::Status::kOk, encrypted_frame.size()}; } size_t GetMaxPlaintextByteSize(cricket::MediaType, size_t encrypted_frame_size) override { return encrypted_frame_size; } protected: ~XChaCha20Decryptor() override { diag_event("DEC destroy ptr=%p key_fp=%08x bad_streak=%u\n", this, key_fingerprint_, bad_audio_streak_.load(std::memory_order_relaxed)); memset(key_, 0, 32); } private: mutable std::atomic ref_{0}; mutable std::atomic diag_count_{0}; mutable std::atomic bad_audio_streak_{0}; mutable RtpProbeState rtp_probe_; mutable GeneratedTsState generated_ts_; mutable AdditionalTsState additional_rel_ts_state_; uint32_t key_fingerprint_ = 0; uint8_t key_[32]; }; /* ════════════════════════════════════════════════════════════════ * JNI exports * ════════════════════════════════════════════════════════════════ */ extern "C" { /* ── HSalsa20 for nacl.box.before() ──────────────────────────── */ JNIEXPORT jbyteArray JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeHSalsa20( JNIEnv *env, jclass, jbyteArray jRawDh) { jsize len = env->GetArrayLength(jRawDh); if (len < 32) return nullptr; auto *raw = (uint8_t *)env->GetByteArrayElements(jRawDh, nullptr); uint8_t out[32]; uint8_t zeros[16] = {0}; rosetta_hsalsa20(out, zeros, raw); env->ReleaseByteArrayElements(jRawDh, (jbyte *)raw, JNI_ABORT); jbyteArray result = env->NewByteArray(32); env->SetByteArrayRegion(result, 0, 32, (jbyte *)out); memset(out, 0, 32); return result; } /* ── Create / destroy encryptor ──────────────────────────────── */ JNIEXPORT jlong JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeCreateEncryptor( JNIEnv *env, jclass, jbyteArray jKey) { jsize len = env->GetArrayLength(jKey); if (len < 32) { LOGE("Create encryptor failed: key length=%d (<32)", (int)len); diag_event("ENC create-failed key_len=%d\n", (int)len); return 0; } auto *key = (uint8_t *)env->GetByteArrayElements(jKey, nullptr); const uint32_t key_fp = key_fingerprint32(key); auto *enc = new XChaCha20Encryptor(key); env->ReleaseByteArrayElements(jKey, (jbyte *)key, JNI_ABORT); // AddRef so the pointer we hand out has ref=1. // WebRTC's scoped_refptr will AddRef again when it takes ownership. enc->AddRef(); LOGI("Created XChaCha20 encryptor %p key_fp=%08x", enc, key_fp); diag_event("ENC created ptr=%p key_fp=%08x key_len=%d\n", enc, key_fp, (int)len); return reinterpret_cast(enc); } JNIEXPORT void JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeReleaseEncryptor( JNIEnv *, jclass, jlong ptr) { if (ptr == 0) return; auto *enc = reinterpret_cast(ptr); LOGI("Release XChaCha20 encryptor %p key_fp=%08x", enc, enc->KeyFingerprint()); diag_event("ENC release ptr=%p key_fp=%08x\n", enc, enc->KeyFingerprint()); enc->Release(); } /* ── Create / destroy decryptor ──────────────────────────────── */ JNIEXPORT jlong JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeCreateDecryptor( JNIEnv *env, jclass, jbyteArray jKey) { jsize len = env->GetArrayLength(jKey); if (len < 32) { LOGE("Create decryptor failed: key length=%d (<32)", (int)len); diag_event("DEC create-failed key_len=%d\n", (int)len); return 0; } auto *key = (uint8_t *)env->GetByteArrayElements(jKey, nullptr); const uint32_t key_fp = key_fingerprint32(key); auto *dec = new XChaCha20Decryptor(key); env->ReleaseByteArrayElements(jKey, (jbyte *)key, JNI_ABORT); dec->AddRef(); LOGI("Created XChaCha20 decryptor %p key_fp=%08x", dec, key_fp); diag_event("DEC created ptr=%p key_fp=%08x key_len=%d\n", dec, key_fp, (int)len); return reinterpret_cast(dec); } JNIEXPORT void JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeReleaseDecryptor( JNIEnv *, jclass, jlong ptr) { if (ptr == 0) return; auto *dec = reinterpret_cast(ptr); LOGI("Release XChaCha20 decryptor %p key_fp=%08x", dec, dec->KeyFingerprint()); diag_event("DEC release ptr=%p key_fp=%08x\n", dec, dec->KeyFingerprint()); dec->Release(); } /* ── Install native crash handler ─────────────────────────────── */ JNIEXPORT void JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeInstallCrashHandler( JNIEnv *env, jclass, jstring jPath) { const char *path = env->GetStringUTFChars(jPath, nullptr); strncpy(g_crash_path, path, sizeof(g_crash_path) - 1); env->ReleaseStringUTFChars(jPath, path); struct sigaction sa = {}; sa.sa_sigaction = native_crash_handler; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sigaction(SIGSEGV, &sa, &g_old_sigsegv); sigaction(SIGABRT, &sa, &g_old_sigabrt); LOGI("Native crash handler installed, path=%s", g_crash_path); } /* ── Open diagnostics file for E2EE frame logging ────────────── */ JNIEXPORT void JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeOpenDiagFile( JNIEnv *env, jclass, jstring jPath) { if (g_diag_fd >= 0) { close(g_diag_fd); g_diag_fd = -1; } g_diag_event_count.store(0, std::memory_order_relaxed); const char *path = env->GetStringUTFChars(jPath, nullptr); strncpy(g_diag_path, path, sizeof(g_diag_path) - 1); g_diag_fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644); env->ReleaseStringUTFChars(jPath, path); if (g_diag_fd >= 0) { diag_write("=== E2EE DIAGNOSTICS pid=%d ===\n", (int)getpid()); LOGI("Diag file opened: %s", g_diag_path); diag_event("DIAG open path=%s\n", g_diag_path); } else { LOGE("Failed to open diag file: %s", g_diag_path); } } JNIEXPORT void JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeCloseDiagFile( JNIEnv *, jclass) { if (g_diag_fd >= 0) { diag_event("DIAG close path=%s\n", g_diag_path); diag_write("=== END ===\n"); close(g_diag_fd); g_diag_fd = -1; } } /* ── Query frame counts for health checks ────────────────────── */ JNIEXPORT jint JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeGetEncryptorFrameCount( JNIEnv *, jclass, jlong ptr) { if (ptr == 0) return -1; auto *enc = reinterpret_cast(ptr); return enc->FrameCount(); } JNIEXPORT jint JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeGetDecryptorFrameCount( JNIEnv *, jclass, jlong ptr) { if (ptr == 0) return -1; auto *dec = reinterpret_cast(ptr); return dec->FrameCount(); } JNIEXPORT jint JNICALL Java_com_rosetta_messenger_network_XChaCha20E2EE_nativeGetDecryptorBadStreak( JNIEnv *, jclass, jlong ptr) { if (ptr == 0) return -1; auto *dec = reinterpret_cast(ptr); return dec->BadStreak(); } } /* extern "C" */