From 47ac3f04b7013457997845bc5d1a86848a86e431 Mon Sep 17 00:00:00 2001 From: Jonathan Lennox Date: Thu, 23 May 2024 16:54:32 -0400 Subject: [PATCH] feat(VideoParser): In some cases, process VP8 or VP9 packets as AV1. (#2134) If the packets don't have the necessary fields to be routed as their payload type, but they do have an AV1 DD, route them based on the AV1 DD instead. --- .../org/jitsi/nlj/rtp/ParsedVideoPacket.kt | 6 ++ .../jitsi/nlj/rtp/codec/VideoCodecParser.kt | 1 - .../jitsi/nlj/rtp/codec/av1/Av1DDPacket.kt | 3 + .../org/jitsi/nlj/rtp/codec/vp8/Vp8Packet.kt | 4 ++ .../org/jitsi/nlj/rtp/codec/vp9/Vp9Packet.kt | 6 ++ .../transform/node/incoming/VideoParser.kt | 62 ++++++++++++++----- 6 files changed, 64 insertions(+), 18 deletions(-) diff --git a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/ParsedVideoPacket.kt b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/ParsedVideoPacket.kt index d4928295b8..5614e6f445 100644 --- a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/ParsedVideoPacket.kt +++ b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/ParsedVideoPacket.kt @@ -31,4 +31,10 @@ abstract class ParsedVideoPacket( abstract val isKeyframe: Boolean abstract val isStartOfFrame: Boolean abstract val isEndOfFrame: Boolean + + /** Whether the packet meets the needs of the routing infrastructure. + * If a packet could be parsed more than one way (e.g. it is VP8 or VP9 but also has an AV1 DD) + * this will let us choose which parse to prefer. + */ + abstract fun meetsRoutingNeeds(): Boolean } diff --git a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/VideoCodecParser.kt b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/VideoCodecParser.kt index 623bb516d4..0f83e7e8ab 100644 --- a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/VideoCodecParser.kt +++ b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/VideoCodecParser.kt @@ -19,7 +19,6 @@ package org.jitsi.nlj.rtp.codec import org.jitsi.nlj.MediaSourceDesc import org.jitsi.nlj.PacketInfo import org.jitsi.nlj.RtpEncodingDesc -import org.jitsi.nlj.findRtpLayerDescs import org.jitsi.nlj.rtp.VideoRtpPacket /** diff --git a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/av1/Av1DDPacket.kt b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/av1/Av1DDPacket.kt index 476beca682..dbb83f0d58 100644 --- a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/av1/Av1DDPacket.kt +++ b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/av1/Av1DDPacket.kt @@ -98,6 +98,9 @@ class Av1DDPacket : ParsedVideoPacket { override val isEndOfFrame: Boolean get() = statelessDescriptor.endOfFrame + override fun meetsRoutingNeeds(): Boolean = + true // If it didn't parse as AV1 we would have failed in the constructor + override val layerIds: Collection get() = frameInfo?.dtisPresent ?: run { super.layerIds } diff --git a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp8/Vp8Packet.kt b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp8/Vp8Packet.kt index 33d98323ff..38c2457dad 100644 --- a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp8/Vp8Packet.kt +++ b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp8/Vp8Packet.kt @@ -69,6 +69,10 @@ class Vp8Packet private constructor( /** This uses [get] rather than initialization because [isMarked] is a var. */ get() = isMarked + override fun meetsRoutingNeeds(): Boolean { + return hasPictureId && hasTemporalLayerIndex + } + val hasTemporalLayerIndex = DePacketizer.VP8PayloadDescriptor.hasTemporalLayerIndex(buffer, payloadOffset, payloadLength) diff --git a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp9/Vp9Packet.kt b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp9/Vp9Packet.kt index 7baf2379d5..ab05bf5d82 100644 --- a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp9/Vp9Packet.kt +++ b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/rtp/codec/vp9/Vp9Packet.kt @@ -68,6 +68,12 @@ class Vp9Packet private constructor( override val isEndOfFrame: Boolean = isEndOfFrame ?: DePacketizer.VP9PayloadDescriptor.isEndOfFrame(buffer, payloadOffset, payloadLength) + override fun meetsRoutingNeeds(): Boolean { + // Question: should we include hasLayerIndices here? I.e. if we get a VP9 packet with an AV1 DD and + // a VP9 picture ID, but no VP9 layer indices, are we better off parsing it as VP9 or AV1? + return hasPictureId + } + override val layerIds: Collection get() = if (hasLayerIndices) { listOf(RtpLayerDesc.getIndex(0, spatialLayerIndex, temporalLayerIndex)) diff --git a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/transform/node/incoming/VideoParser.kt b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/transform/node/incoming/VideoParser.kt index 850e39fa9d..d38b3ffeb7 100644 --- a/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/transform/node/incoming/VideoParser.kt +++ b/jitsi-media-transform/src/main/kotlin/org/jitsi/nlj/transform/node/incoming/VideoParser.kt @@ -22,6 +22,7 @@ import org.jitsi.nlj.SetMediaSourcesEvent import org.jitsi.nlj.findRtpSource import org.jitsi.nlj.format.Vp8PayloadType import org.jitsi.nlj.format.Vp9PayloadType +import org.jitsi.nlj.rtp.ParsedVideoPacket import org.jitsi.nlj.rtp.RtpExtensionType import org.jitsi.nlj.rtp.codec.VideoCodecParser import org.jitsi.nlj.rtp.codec.av1.Av1DDParser @@ -76,25 +77,17 @@ class VideoParser( val parsedPacket = try { when { payloadType is Vp8PayloadType -> { - val vp8Packet = packetInfo.packet.toOtherType(::Vp8Packet) - packetInfo.packet = vp8Packet - packetInfo.resetPayloadVerification() - - videoCodecParser = checkParserType(packetInfo) { source -> + val (vp8Packet, parser) = parseNormalPayload(packetInfo, ::Vp8Packet) { source -> Vp8Parser(source, logger) } - + videoCodecParser = parser vp8Packet } payloadType is Vp9PayloadType -> { - val vp9Packet = packetInfo.packet.toOtherType(::Vp9Packet) - packetInfo.packet = vp9Packet - packetInfo.resetPayloadVerification() - - videoCodecParser = checkParserType(packetInfo) { source -> + val (vp9Packet, parser) = parseNormalPayload(packetInfo, ::Vp9Packet) { source -> Vp9Parser(source, logger) } - + videoCodecParser = parser vp9Packet } av1DDExtId != null && packet.getHeaderExtension(av1DDExtId) != null -> { @@ -123,7 +116,6 @@ class VideoParser( } } packetInfo.layeringChanged = true - videoCodecParser = null } return packetInfo } @@ -143,17 +135,53 @@ class VideoParser( * so the count is correct. */ /* Alternately we could keep track of keyframes we've already seen, by timestamp, but that seems unnecessary. */ if (parsedPacket != null && parsedPacket.isKeyframe && parsedPacket.isStartOfFrame) { - logger.cdebug { "Received a keyframe for ssrc ${packet.ssrc} ${packet.sequenceNumber}" } + logger.cdebug { "Received a keyframe for ssrc ${packet.ssrc} at seq ${packet.sequenceNumber}" } stats.numKeyframes++ } if (packetInfo.layeringChanged) { - logger.cdebug { "Layering structure changed for ssrc ${packet.ssrc} ${packet.sequenceNumber}" } + logger.cdebug { "Layering structure changed for ssrc ${packet.ssrc} at seq ${packet.sequenceNumber}" } stats.numLayeringChanges++ } return packetInfo } + /** A normal payload is one where we choose the subclass of the ParsedVideoPacket and VideoCodecParser + * based on the payload type, as opposed to the header extension (like AV1). If the packet doesn't + * satisfy [ParsedVideoPacket.meetsRoutingNeeds] but it has an AV1 DD header extension, we will parse + * this packet as AV1 rather than as its normal type. + * */ + private inline fun parseNormalPayload( + packetInfo: PacketInfo, + otherTypeCreator: (ByteArray, Int, Int) -> ParsedVideoPacket, + parserConstructor: (MediaSourceDesc) -> T + ): Pair { + val parsedPacket = packetInfo.packet.toOtherType(otherTypeCreator) + if (!parsedPacket.meetsRoutingNeeds()) { + // See if we can parse this packet as AV1 + val packet = packetInfo.packetAs() + val av1DDExtId = this.av1DDExtId // So null checks work + if (av1DDExtId != null && packet.getHeaderExtension(av1DDExtId) != null) { + val parser = checkParserType(packetInfo) { source -> + Av1DDParser(source, logger, diagnosticContext) + } + + val av1DDPacket = parser?.createFrom(packet, av1DDExtId)?.also { + packetInfo.packet = it + packetInfo.resetPayloadVerification() + } + + return Pair(av1DDPacket, parser) + } + } + packetInfo.packet = parsedPacket + packetInfo.resetPayloadVerification() + + val parser = checkParserType(packetInfo, parserConstructor) + + return Pair(parsedPacket, parser) + } + private inline fun checkParserType( packetInfo: PacketInfo, constructor: (MediaSourceDesc) -> T @@ -168,8 +196,8 @@ class VideoParser( ?: // VideoQualityLayerLookup will drop this packet later, so no need to warn about it now return null logger.cdebug { - "Creating new ${T::class.java} for source ${source.sourceName}, " + - "current videoCodecParser is ${parser?.javaClass}" + "Creating new ${T::class.java.simpleName} for source ${source.sourceName}, " + + "current videoCodecParser is ${parser?.javaClass?.simpleName}" } resetSource(source) packetInfo.layeringChanged = true