diff --git a/v/src/Bundles.scala b/v/src/Bundles.scala index 2d58c185a..b3aaa87df 100644 --- a/v/src/Bundles.scala +++ b/v/src/Bundles.scala @@ -156,46 +156,6 @@ class LaneRequest(param: LaneParameter) extends Bundle { // vmacc 的vd需要跨lane读 TODO: move to [[V]] def ma: Bool = decodeResult(Decoder.multiplier) && decodeResult(Decoder.uop)(1, 0).xorR && !decodeResult(Decoder.vwmacc) - - // TODO: move to Module - def initState: InstGroupState = { - val res: InstGroupState = Wire(new InstGroupState(param)) - val sCrossRead = !decodeResult(Decoder.crossRead) - val sCrossWrite = !decodeResult(Decoder.crossWrite) - // decode的时候需要注意有些bit操作的指令虽然不需要读vs1,但是需要读v0 - res.sRead1 := !decodeResult(Decoder.vtype) - res.sRead2 := false.B - res.sReadVD := decodeResult(Decoder.sReadVD) - res.wCrossReadLSB := sCrossRead - res.wCrossReadMSB := sCrossRead - res.wResponseFeedback := decodeResult(Decoder.scheduler) - res.sSendResponse := decodeResult(Decoder.scheduler) - res.sExecute := decodeResult(Decoder.dontNeedExecuteInLane) - res.wExecuteRes := decodeResult(Decoder.dontNeedExecuteInLane) - res.sWrite := decodeResult(Decoder.sWrite) - res.sCrossWriteLSB := sCrossWrite - res.sCrossWriteMSB := sCrossWrite - res.sSendCrossReadResultLSB := sCrossRead - res.sSendCrossReadResultMSB := sCrossRead - res.sCrossReadLSB := sCrossRead - res.sCrossReadMSB := sCrossRead - res - } - - // TODO: move to Module - // TODO: remove instType - def instType: UInt = { - VecInit( - Seq( - decodeResult(Decoder.logic) && !decodeResult(Decoder.other), - decodeResult(Decoder.adder) && !decodeResult(Decoder.other), - decodeResult(Decoder.shift) && !decodeResult(Decoder.other), - decodeResult(Decoder.multiplier) && !decodeResult(Decoder.other), - decodeResult(Decoder.divider) && !decodeResult(Decoder.other), - decodeResult(Decoder.other) - ) - ).asUInt - } } class InstGroupState(param: LaneParameter) extends Bundle { @@ -264,16 +224,16 @@ class InstructionControlRecord(param: LaneParameter) extends Bundle { /** Store request from [[V]]. */ val laneRequest: LaneRequest = new LaneRequest(param) - /** State machine of the current instruction. */ - val state: InstGroupState = new InstGroupState(param) - /** csr follows the instruction. * TODO: move to [[laneRequest]] */ val csr: CSRInterface = new CSRInterface(param.vlMaxBits) - /** which group in the slot is executing. */ - val groupCounter: UInt = UInt(param.groupNumberBits.W) + /** which group is the last group for instruction. */ + val lastGroupForInstruction: UInt = UInt(param.groupNumberBits.W) + + /** this is the last lane for mask type instruction */ + val isLastLaneForMaskLogic: Bool = Bool() /** the find first one instruction is finished by other lanes, * for example, sbf(set before first) @@ -632,3 +592,32 @@ class LSURequest(dataWidth: Int) extends Bundle { */ val instructionIndex: UInt = UInt(3.W) } + +// queue bundle for execute stage +class LaneExecuteStage(parameter: LaneParameter)(isLastSlot: Boolean) extends Bundle { + // which group for this execution + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) + + // mask for this execute group + val mask: UInt = UInt(4.W) + + /** Store some data that will be used later. e.g: + * ffo Write VRF By OtherLanes: What should be written into vrf if ffo end by other lanes. pipe from s0 + * read result of vs2, for instructions that are not executed, pipe from s1 + */ + val pipeData: Option[UInt] = Option.when(isLastSlot)(UInt(parameter.datapathWidth.W)) + /** pipe vd for ffo */ + val pipeVD: Option[UInt] = Option.when(isLastSlot)(UInt(parameter.datapathWidth.W)) +} + +// Record of temporary execution units +class ExecutionUnitRecord(parameter: LaneParameter)(isLastSlot: Boolean) extends Bundle { + val crossReadVS2: Bool = Bool() + val bordersForMaskLogic: Bool = Bool() + val mask: UInt = UInt(4.W) + val executeIndex: UInt = UInt(2.W) + val source: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) + val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + /** groupCounter need use to update `Lane.maskFormatResultForGroup` */ + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) +} diff --git a/v/src/LSU.scala b/v/src/LSU.scala index aa5f6988c..620e74a4a 100644 --- a/v/src/LSU.scala +++ b/v/src/LSU.scala @@ -281,7 +281,8 @@ class LSU(param: LSUParam) extends Module { val selectedIdleMSHR: UInt = ffo(idleMSHRs)(param.lsuMSHRSize - 1, 0) reqEnq := VecInit(Mux(request.valid, selectedIdleMSHR, 0.U).asBools) // todo: address conflict - request.ready := idleMSHRs.orR + // todo: Execute only a single lsu instruction first + request.ready := idleMSHRs.andR Seq.tabulate(param.laneNumber) { laneID => // LSU slots read VRF request arbitration diff --git a/v/src/Lane.scala b/v/src/Lane.scala index 9d045cd5c..0091d6e6f 100644 --- a/v/src/Lane.scala +++ b/v/src/Lane.scala @@ -101,11 +101,16 @@ case class LaneParameter( val maskGroupWidth: Int = datapathWidth /** see [[VParameter.maskGroupSize]] */ - val maskGroupSize: Int = vLen / datapathWidth + val maskGroupSize: Int = vLen / maskGroupWidth /** hardware width of [[maskGroupSize]]. */ val maskGroupSizeBits: Int = log2Ceil(maskGroupSize) + /** Size of the queue for storing execution information + * todo: Determined by the longest execution unit + * */ + val executionQueueSize: Int = 2 + /** Parameter for [[VRF]] */ def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize) @@ -229,43 +234,22 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** the slot is occupied by instruction */ val slotOccupied: Vec[Bool] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(false.B))) - /** read from VRF vs1 for VFU */ - val source1: Vec[UInt] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(parameter.datapathWidth.W)))) - - /** read from VRF vs2 for VFU */ - val source2: Vec[UInt] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(parameter.datapathWidth.W)))) - - /** read from VRF rd for VFU */ - val source3: Vec[UInt] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(parameter.datapathWidth.W)))) - - /** execution result, write to VRF, - * or goes to [[V]] for complex instructions like reduce - */ - val executionResult: Vec[UInt] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(parameter.datapathWidth.W)))) - // 跨lane写的额外用寄存器存储执行的结果和mask - - // wait data for EEW = 2*SEW - // TODO: do we need to switch to remote waiting? - /** cross write LSB data to send out to other lanes. */ - val crossWriteDataLSBHalf: UInt = RegInit(0.U(parameter.datapathWidth.W)) - - /** cross write LSB mask to send out to other lanes. */ - val crossWriteMaskLSBHalf: UInt = RegInit(0.U((parameter.dataPathByteWidth / 2).W)) - - /** cross write MSB data to send out to other lanes. */ - val crossWriteDataMSBHalf: UInt = RegInit(0.U(parameter.datapathWidth.W)) + /** mask group count for slot */ + val maskGroupCountVec: Vec[UInt] = + RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(parameter.maskGroupSizeBits.W)))) - /** cross write MSB mask to send out to other lanes. */ - val crossWriteMaskMSBHalf: UInt = RegInit(0.U((parameter.dataPathByteWidth / 2).W)) + /** mask index for slot */ + val maskIndexVec: Vec[UInt] = + RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(log2Ceil(parameter.maskGroupWidth).W)))) - /** result of mask format, it need to re-arrange in Scheduler. */ - val maskFormatResult: UInt = RegInit(0.U(parameter.datapathWidth.W)) + /** pipe state for slot */ + val pipeFinishVec: Vec[Bool] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(false.B))) /** the find first one index register in this lane. */ val ffoIndexReg: UInt = RegInit(0.U(log2Ceil(parameter.vLen / 8).W)) /** result of reduce instruction. */ - val reduceResult: Vec[UInt] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(parameter.datapathWidth.W)))) + val reduceResult: UInt = RegInit(0.U(parameter.datapathWidth.W)) /** arbiter for VRF write * 1 for [[vrfWriteChannel]] @@ -368,6 +352,11 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ */ val slotActive: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + /** When the slot wants to move, + * you need to stall the pipeline first and wait for the pipeline to be cleared. + */ + val slotShiftValid: Bool = Wire(Bool()) + /** The slots start to shift in these rules: * - instruction can only enqueue to the last slot. * - all slots can only shift at the same time which means: @@ -377,6 +366,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ */ val slotCanShift: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + /** Which data group is waiting for the result of the cross-lane read */ + val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) + /** cross lane reading port from [[readBusPort]] * if [[ReadBusData.sinkIndex]] matches the index of this lane, dequeue from ring */ @@ -406,15 +398,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** for each slot, it occupies which VFU. */ val instructionTypeVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.executeUnitNum.W))) - /** a instruction is finished in the corresponding slot. */ - val instructionExecuteFinished: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) - - /** Instructions that read across lane will have an extra set of reads if vl is not aligned. */ - val instructionCrossReadFinished: Bool = Wire(Bool()) - - /** valid for requesting mask unit. */ - val maskRequestValids: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) - /** request for logic instruction type. */ val logicRequests: Vec[MaskedLogicRequest] = Wire( Vec(parameter.chainingSize, new MaskedLogicRequest(parameter.datapathWidth)) @@ -447,9 +430,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val otherResponse: OtherUnitResp = Wire(Output(new OtherUnitResp(parameter.datapathWidth))) - /** request for mask instruction type. */ - val maskRequests: Vec[LaneResponse] = Wire(Vec(parameter.chainingSize, Output(new LaneResponse(parameter)))) - /** assert when a instruction is finished in the slot. */ val instructionFinishedVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W))) @@ -484,1094 +464,556 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) ) - // Slot logic ~1.2k lines slotControl.zipWithIndex.foreach { case (record, index) => - /** The execution is finished - * - all execution got its result. - * - VRF written is finished. - * - not coupling to cross lane read/write. - */ - val executeFinish = - // TODO: why we need `sExecute`? - record.state.sExecute && - record.state.wExecuteRes && - record.state.sWrite - - /** cross write is done. */ - val crossWriteFinish: Bool = record.state.sCrossWriteLSB && record.state.sCrossWriteMSB - - /** send cross read result is done. */ - val sendCrossReadResultFinish: Bool = record.state.sSendCrossReadResultLSB && record.state.sSendCrossReadResultMSB - - /** transaction between [[Lane]] and [[V]] is done. */ - val schedulerFinish: Bool = record.state.wResponseFeedback && record.state.sSendResponse + val decodeResult: DecodeBundle = record.laneRequest.decodeResult + val isLastSlot: Boolean = index == 0 - /** need mask from [[V]]. */ - val needMaskSource: Bool = record.laneRequest.mask - - /** all read from VRF is done. - * it need additional one cycle to indicate there is no read in VRF since the read latency of VRF is 1. + /** We will ignore the effect of mask since: + * [[Decoder.crossRead]]: We need to read data to another lane + * [[Decoder.scheduler]]: We need to synchronize with [[V]] every group + * [[record.laneRequest.loadStore]]: We need to read data to lsu every group */ - val readVrfRequestFinish: Bool = record.state.sReadVD && record.state.sRead1 && record.state.sRead2 + val alwaysNextGroup: Bool = decodeResult(Decoder.crossRead) || decodeResult(Decoder.nr) || + !decodeResult(Decoder.scheduler) || record.laneRequest.loadStore - /** for non-masked instruction, always ready, - * for masked instruction, need to wait for mask - */ - val maskReady: Bool = record.mask.valid || !needMaskSource + // mask not use for mask element + val maskNotMaskedElement = !record.laneRequest.mask || + record.laneRequest.decodeResult(Decoder.maskSource) || + record.laneRequest.decodeResult(Decoder.maskLogic) /** onehot value of SEW. */ - val vSew1H: UInt = UIntToOH(record.csr.vSew) - - /** the current index of execution. */ - val elementIndex: UInt = Mux1H( - vSew1H(2, 0), - Seq( - // SEW = 8 - (record.groupCounter ## record.executeIndex)(4, 0), - // SEW = 16 - (record.groupCounter ## record.executeIndex(1))(4, 0), - // SEW = 32 - record.groupCounter - ) - ) - - /** mask for current element. */ - val maskBits: Bool = record.mask.bits(elementIndex(parameter.datapathWidthBits - 1, 0)) - - /** mask bit which will be sent to execution unit.(e.g. input of `adc`) */ - val maskAsInput: Bool = maskBits && (record.laneRequest.decodeResult(Decoder.maskSource) || - record.laneRequest.decodeResult(Decoder.gather)) + val vSew1H: UInt = UIntToOH(record.csr.vSew)(2, 0) /** if asserted, the element won't be executed. * adc: vm = 0; madc: vm = 0 -> s0 + s1 + c, vm = 1 -> s0 + s1 */ val skipEnable: Bool = record.laneRequest.mask && !record.laneRequest.decodeResult(Decoder.maskSource) && - !record.laneRequest.decodeResult(Decoder.maskLogic) - - /** This element is skipped due to skipEnable and corresponding mask is 0. */ - val masked: Bool = skipEnable && !maskBits - - // find the next unmasked element. - /** - * TODO: fixme logic here. - */ - val maskForExecutionGroup: UInt = Mux1H( - Seq(skipEnable && record.mask.valid, !skipEnable), - Seq(record.mask.bits, (-1.S(parameter.datapathWidth.W)).asUInt) + !record.laneRequest.decodeResult(Decoder.maskLogic) && + !alwaysNextGroup + + // mask for current mask group + val maskForMaskGroup: UInt = Mux( + skipEnable, + record.mask.bits, + (-1.S(parameter.datapathWidth.W)).asUInt ) - /** the current 1H of the executing element in the mask group. */ - val current1H = UIntToOH(elementIndex(4, 0)) + // register for s0 enqueue, it will move with the slot + // 'maskGroupCountVec' 'maskIndexVec' 'pipeFinishVec' - /** the next element to execute in the group. */ - val next1H = - ffo((scanLeftOr(current1H) ## false.B) & maskForExecutionGroup)(parameter.datapathWidth - 1, 0) + // pipe clear + val pipeClear: Bool = Wire(Bool()) - /** the index to write to VRF in [[parameter.dataPathByteWidth]]. - * for long latency pipe, the index will follow the pipeline. - */ - val writeIndex = Mux( - record.laneRequest.decodeResult(Decoder.divider), - divWriteIndex, - record.executeIndex - ) + if (isLastSlot) { + slotActive(index) := slotOccupied(index) && !pipeFinishVec(index) + } else { + slotActive(index) := slotOccupied(index) && !pipeFinishVec(index) && !slotShiftValid && + !(decodeResult(Decoder.crossRead) || decodeResult(Decoder.crossWrite)) && + decodeResult(Decoder.scheduler) + } - /** VRF byte level mask */ - val writeMaskInByte = Mux1H( - vSew1H(2, 0), - Seq( - // TODO: move UIntToOH out - UIntToOH(writeIndex), - writeIndex(1) ## writeIndex(1) ## !writeIndex(1) ## !writeIndex(1), - "b1111".U(4.W) - ) - ) + if(isLastSlot) { slotCanShift(index) := true.B } else { slotCanShift(index) := pipeClear } + + // --- stage 0 start --- + // todo: parameter register width for all stage + + // register for stage0 + val valid0: Bool = RegInit(false.B) + val groupCounterInStage0: UInt = RegInit(0.U(parameter.groupNumberBits.W)) + val maskInStage0: UInt = RegInit(0.U(4.W)) + val sSendResponseInStage0: Option[Bool] = Option.when(isLastSlot) {RegInit(true.B)} + + val s0Valid: Bool = Wire(Bool()) + val s0Ready: Bool = Wire(Bool()) + val s0Fire: Bool = s0Valid && s0Ready + + /** Filter by different sew */ + val filterVec: Seq[(Bool, UInt)] = Seq(0, 1, 2).map { filterSew => + // The lower 'dataGroupIndexSize' bits represent the offsets in the data group + val dataGroupIndexSize: Int = 2 - filterSew + // each group has '2 ** dataGroupIndexSize' elements + val dataGroupSize = 1 << dataGroupIndexSize + // The data group index of last data group + val groupIndex = (maskIndexVec(index) >> dataGroupIndexSize).asUInt + // Filtering data groups + val groupFilter: UInt = scanLeftOr(UIntToOH(groupIndex)) ## false.B + // Whether there are element in the data group that have not been masked + // TODO: use 'record.maskGroupedOrR' & update it + val maskForDataGroup: UInt = + VecInit(maskForMaskGroup.asBools.grouped(dataGroupSize).map(_.reduce(_ || _)).toSeq).asUInt + val groupFilterByMask = maskForDataGroup & groupFilter + // ffo next group + val nextDataGroupOH: UInt = ffo(groupFilterByMask) + // This mask group has the next data group to execute + val hasNextDataGroup = nextDataGroupOH.orR + val nextElementBaseIndex: UInt = (OHToUInt(nextDataGroupOH) << dataGroupIndexSize).asUInt + (hasNextDataGroup, nextElementBaseIndex) + } - /** VRF bit level mask */ - val writeMaskInBit: UInt = FillInterleaved(8, writeMaskInByte) + /** is there any data left in this group? */ + val nextOrR: Bool = Mux1H(vSew1H, filterVec.map(_._1)) - /** output of execution unit need to align to VRF in bit level(used in dynamic shift) - * TODO: fix me - */ - val dataOffset: UInt = writeIndex ## 0.U(3.W) + // mask is exhausted + val maskExhausted: Bool = !nextOrR - /** is there any data left in this group? */ - val nextOrR: Bool = next1H.orR + /** The index of next element in this mask group.(0-31) */ + val nextIndex: UInt = Mux(decodeResult(Decoder.maskLogic), 0.U, Mux1H(vSew1H, filterVec.map(_._2))) - /** the index of next element in this group.(0-31) */ - val nextIndex: UInt = OHToUInt(next1H) + /** The mask group will be updated */ + val maskGroupWillUpdate: Bool = decodeResult(Decoder.maskLogic) || maskExhausted - /** a mask group can represent 8 execution groups. - * - * the [[groupCounter]] being all `1` in corresponding SEW indicates - * this is the last execution group of the mask group. - * - * when assert, mask group should be updated in next cycle. - */ - val lastGroupForMask = Mux1H( - vSew1H(2, 0), + /** next mask group */ + val nextMaskGroupCount: UInt = maskGroupCountVec(index) + maskGroupWillUpdate + + /** The index of next execute element in whole instruction */ + val elementIndexForInstruction = maskGroupCountVec(index) ## Mux1H( + vSew1H, Seq( - record.groupCounter(log2Ceil(parameter.maskGroupWidth / 4) - 1, 0).andR, - record.groupCounter(log2Ceil(parameter.maskGroupWidth / 2) - 1, 0).andR, - record.groupCounter(log2Ceil(parameter.maskGroupWidth) - 1, 0).andR + maskIndexVec(index)(parameter.datapathWidthBits - 1, 2) ## laneIndex ## maskIndexVec(index)(1, 0), + maskIndexVec(index)(parameter.datapathWidthBits - 1, 1) ## laneIndex ## maskIndexVec(index)(0), + maskIndexVec(index) ## laneIndex ) ) - /** need to update mask when - * - the last execution group in the mask group - * - the instruction is finished - */ - val canUpdateVRFMaskFormat: Bool = lastGroupForMask || instructionExecuteFinished(index) - - /** if the instruction type is `maskDestination`, for each lanes, - * the result of execution unit should be updated to VRF in mask format, - * this is a bit level cross lane access, which cannot use the cross lane channel, - * thus we use [[LaneResponse]] to update to [[V]] - * and [[V]] should regroup it and use [[vrfWriteChannel]] to send to each [[Lane]] - */ - val maskTypeDestinationWriteReady: Bool = - !record.laneRequest.decodeResult(Decoder.maskDestination) || canUpdateVRFMaskFormat - - /** the instruction type in the slot is a reduce type. */ - val reduceType: Bool = record.laneRequest.decodeResult(Decoder.red) - /** the instruction type in the slot is a readOnly type. */ - val readOnly: Bool = record.laneRequest.decodeResult(Decoder.readOnly) + /** The next element is out of execution range */ + val outOfExecutionRange = Mux( + decodeResult(Decoder.maskLogic), + (maskGroupCountVec(index) > record.lastGroupForInstruction), + elementIndexForInstruction >= record.csr.vl + ) || record.instructionFinished - /** reduce should send feedback to at the last cycle of the execution group. */ - val reduceReady = !reduceType || instructionExecuteFinished(index) + // todo: 如果这一部分时序不够,可以放到下一级去, 然后在下一级 kill nr类型的 + /** Encoding of different element lengths: 1, 8, 16, 32 */ + val elementLengthOH = Mux(decodeResult(Decoder.maskLogic), 1.U, vSew1H(2, 0) ## false.B) - /** [[Lane]] need response from [[V]] for some instructions. - * TODO: doc - * 正常来说scheduler都需要回应,只是 mask destination 和 reduce 需要有先决条件 - */ - val needResponse: Bool = !record.laneRequest.decodeResult(Decoder.scheduler) && - maskTypeDestinationWriteReady && reduceReady && slotActive(index) - - // CSR - /** if `vl = N`, the last element index is `N-1`, for each `x`, is a bit inside `vl` - * - * xxxxx xxx xx -> vsew = 0 (1 element -> 8 bits) - * xxxxxx xxx x -> vsew = 1 (1 element -> 16 bits) - * xxxxxxx xxx -> vsew = 2 (1 element -> 32 bits) - * | | | - * execute index - * lane index - * group index - * - * TODO: we truncate the vl from TOP, do we need - 1 in [[V]]? - * take care of `vl = 0` - */ - val lastElementIndex: UInt = (record.csr.vl - 1.U)(parameter.vlMaxBits - 2, 0) - - /** For an instruction, the last group is not executed by all lanes, - * here is the index of last group of the instruction - */ - val lastGroupIndex: UInt = Mux1H( - vSew1H(2, 0), + /** Which group of data will be accessed */ + val dataGroupIndex: UInt = Mux1H( + elementLengthOH, Seq( - lastElementIndex(parameter.vlMaxBits - 2, parameter.laneNumberBits + 2), - lastElementIndex(parameter.vlMaxBits - 2, parameter.laneNumberBits + 1), - lastElementIndex(parameter.vlMaxBits - 2, parameter.laneNumberBits) + maskGroupCountVec(index), + maskGroupCountVec(index) ## maskIndexVec(index)(4, 2), + maskGroupCountVec(index) ## maskIndexVec(index)(4, 1), + maskGroupCountVec(index) ## maskIndexVec(index) ) ) - /** the lane that the last element locate */ - val lastLaneIndex: UInt = Mux1H( - vSew1H(2, 0), - Seq( - lastElementIndex(parameter.laneNumberBits + 2 - 1, 2), - lastElementIndex(parameter.laneNumberBits + 1 - 1, 1), - lastElementIndex(parameter.laneNumberBits - 1, 0) - ) - ) - - /** Used to calculate the last group of mask, which will only be effective when [[isEndLane]] */ - val lastElementExecuteIndex: UInt = Mux1H( - vSew1H(1, 0), - Seq( - lastElementIndex(1, 0), - lastElementIndex(0) ## false.B + /** Calculate the mask of the request that is in s0 */ + val maskEnqueueWireInStage0: UInt = (record.mask.bits >> maskIndexVec(index)).asUInt(3, 0) + + val isTheLastGroup = dataGroupIndex === record.lastGroupForInstruction + // update register in s0 + when(s0Fire) { + maskGroupCountVec(index) := nextMaskGroupCount + // todo: handle all elements in first group are masked + maskIndexVec(index) := nextIndex + groupCounterInStage0 := dataGroupIndex + maskInStage0 := maskEnqueueWireInStage0 + sSendResponseInStage0.foreach(state => + state := + !(record.laneRequest.loadStore || + decodeResult(Decoder.readOnly) || + (decodeResult(Decoder.red) && isTheLastGroup) || + (decodeResult(Decoder.maskDestination) && (maskGroupWillUpdate || isTheLastGroup)) || + decodeResult(Decoder.ffo)) ) - ) - - /** The relative position of the last lane determines the processing of the last group. */ - val lanePositionLargerThanEndLane: Bool = laneIndex > lastLaneIndex - - /** This lane is the last group. */ - val isEndLane: Bool = laneIndex === lastLaneIndex - - /** for this lane, which group is the last group. */ - val lastGroupForLane: UInt = lastGroupIndex - lanePositionLargerThanEndLane + } - /** when [[InstructionControlRecord.executeIndex]] reaches [[slotGroupFinishedIndex]], the group in the slot is finished. - * 00 -> 11 - * 01 -> 10 - * 10 -> 00 - * - * TODO: 64bit - */ - val slotGroupFinishedIndex: UInt = !record.csr.vSew(1) ## !record.csr.vSew.orR - - /** mask logic is in bit level granularity: - * - * xxx xxx xxxxx(1 element -> 1 bits) - * | | | - * execute index - * lane index - * group index - * TODO: better name. - */ - val vlTail: UInt = record.csr.vl(parameter.datapathWidthBits - 1, 0) + // Handshake for s0 + s0Valid := slotActive(index) && !outOfExecutionRange && (record.mask.valid || !record.laneRequest.mask) - /** lane index in [[vlTail]] - * TODO: better name. - */ - val vlBody: UInt = - record.csr.vl(parameter.datapathWidthBits + parameter.laneNumberBits - 1, parameter.datapathWidthBits) - - /** group index in [[vlTail]] - * TODO: better name. - */ - val vlHead: UInt = record.csr.vl(parameter.vlMaxBits - 1, parameter.datapathWidthBits + parameter.laneNumberBits) + when(!pipeFinishVec(index) && outOfExecutionRange) { + pipeFinishVec(index) := true.B + } - /** use mask to fix the case that `vl` is not in the multiple of [[parameter.datapathWidth]]. - * it will fill the LSB of mask to `0`, mask it to not execute those elements. - */ - val lastGroupMask = scanRightOr(UIntToOH(vlTail)) >> 1 + instructionFinishedVec(index) := 0.U + when(slotOccupied(index) && pipeClear && pipeFinishVec(index)) { + slotOccupied(index) := false.B + instructionFinishedVec(index) := UIntToOH( + record.laneRequest.instructionIndex(parameter.instructionIndexBits - 2, 0) + ) + } - /** `vl` is not in the multiple of [[parameter.datapathWidth]]. */ - val dataPathMisaligned = vlTail.orR + // update mask todo: handle maskRequestFireOH + slotMaskRequestVec(index).valid := maskExhausted && record.laneRequest.mask && (s0Fire || !record.mask.valid) + slotMaskRequestVec(index).bits := nextMaskGroupCount + // There are new masks + val maskUpdateFire: Bool = slotMaskRequestVec(index).valid && maskRequestFireOH(index) + // The old mask is used up + val maskFailure: Bool = maskExhausted && s0Fire + // update mask register + when(maskUpdateFire) { + record.mask.bits := maskInput + } + when(maskUpdateFire ^ maskFailure) { + record.mask.valid := maskUpdateFire + } - /** how many groups will be executed for the instruction. - * if [[dataPathMisaligned]], the last group will be executed. - */ - val maskLogicGroupCount = (vlHead ## vlBody) - !dataPathMisaligned + // --- stage 0 end & stage 1_0 start --- - /** which lane should execute the last group. */ - val lastLaneIndexForMaskLogic: UInt = maskLogicGroupCount(parameter.laneNumberBits - 1, 0) + // stage 1_0 reg + val valid1: Bool = RegInit(false.B) - /** should this lane execute the last group? */ - val isLastLaneForMaskLogic: Bool = lastLaneIndexForMaskLogic === laneIndex + /** schedule read src1 */ + val sRead0 = RegInit(true.B) - /** TODO: bug? */ - val lastGroupCountForMaskLogic: UInt = maskLogicGroupCount >> parameter.laneNumberBits + /** schedule read src2 */ + val sRead1 = RegInit(true.B) - /** for mask logic, the group is the last group. */ - val lastGroupForMaskLogic: Bool = lastGroupCountForMaskLogic === record.groupCounter + /** schedule read vd */ + val sRead2 = RegInit(true.B) - /** mask logic will complete at the end of group. - * TODO: move `&& record.laneRequest.decodeResult(Decoder.maskLogic)` to [[lastGroupForMaskLogic]] - */ - val maskLogicWillCompleted: Bool = lastGroupForMaskLogic && record.laneRequest.decodeResult(Decoder.maskLogic) + // pipe from stage0 + val groupCounterInStage1: UInt = RegInit(0.U(parameter.groupNumberBits.W)) - /** the last element is inside this group. */ - val bordersForMaskLogic: Bool = lastGroupForMaskLogic && isLastLaneForMaskLogic && - dataPathMisaligned && record.laneRequest.decodeResult(Decoder.maskLogic) + // mask for group pipe from stage0 + val maskInStage1: UInt = RegInit(0.U(4.W)) + val maskForFilterInStage1: UInt = FillInterleaved(4, maskNotMaskedElement) | maskInStage1 - /** if [[bordersForMaskLogic]], use [[lastGroupMask]] to mask the result otherwise use [[fullMask]]. */ - val maskCorrect = Mux(bordersForMaskLogic, lastGroupMask, fullMask) + // read result register + val readResult0: UInt = RegInit(0.U(parameter.datapathWidth.W)) + val readResult1: UInt = RegInit(0.U(parameter.datapathWidth.W)) + val readResult2: UInt = RegInit(0.U(parameter.datapathWidth.W)) - /** no need to waif for [[laneResponseFeedback]] */ - val noFeedBack: Bool = !(readOnly || record.laneRequest.loadStore) + /** schedule cross lane read LSB.(access VRF for cross read) */ + val sCrossReadLSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - /** TODO: move to [[V]]. */ - val nr = record.laneRequest.decodeResult(Decoder.nr) + /** schedule cross lane read MSB.(access VRF for cross read) */ + val sCrossReadMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - /** the execution unit need more than one cycle, - * divider. - */ - val longLatency: Bool = instructionTypeVec(index)(4) + /** schedule send cross lane read LSB result. */ + val sSendCrossReadResultLSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - /** the long latency execution unit is masked. */ - val maskedLongLatency: Bool = masked && longLatency - if (index != 0) { - // read only - val decodeResult: DecodeBundle = record.laneRequest.decodeResult - val needCrossRead = decodeResult(Decoder.crossRead) - val needCrossWrite = decodeResult(Decoder.crossWrite) + /** schedule send cross lane read MSB result. */ + val sSendCrossReadResultMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - /** select from VFU, send to [[executionResult]], [[crossWriteDataLSBHalf]], [[crossWriteDataMSBHalf]]. */ - val dataDequeue: UInt = Mux1H(instructionTypeVec(index), executeDequeueData) + /** wait for cross lane read LSB result. */ + val wCrossReadLSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - /** fire of [[dataDequeue]] */ - val dataDequeueFire: Bool = (instructionTypeVec(index) & executeDequeueFire).orR + /** wait for cross lane read MSB result. */ + val wCrossReadMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - /** [[record.groupCounter]] & [[record.executeIndex]] is used as index of current data sending to VFU. - * By default, data is not masked. due the the logic in [[next1H]], it is not masked. - * when updating [[record.mask.bits]], the pointer is updated to the first item of mask group. - * but this element might be masked. - * So firstMasked is used to take care of this. - */ - /*val firstMasked: Bool = Wire(Bool())*/ - // TODO: move this to verification module - when(needCrossRead) { - assert(record.csr.vSew != 2.U) - } - slotActive(index) := - // slot should alive - slotOccupied(index) && - // head should alive, if not, the slot should shift to make head alive - slotOccupied.head && - // cross lane instruction should execute in the first slot - !record.laneRequest.decodeResult(Decoder.specialSlot) && - // mask should ready for masked instruction - maskReady - - // wait read result - val readNext0 = RegNext(vrfReadRequest(index)(0).fire) - val readNext1 = RegNext(vrfReadRequest(index)(1).fire) - val readNext2 = RegNext(vrfReadRequest(index)(2).fire) - // shift slot // !record.state.sExecute - slotCanShift(index) := !((readNext0 || readNext1 || readNext2) && slotOccupied(index)) - - // vs1 read - vrfReadRequest(index)(0).valid := !record.state.sRead1 && slotActive(index) - vrfReadRequest(index)(0).bits.offset := record.groupCounter(parameter.vrfOffsetBits - 1, 0) - // todo: when vlmul > 0 use ## rather than + - vrfReadRequest(index)(0).bits.vs := record.laneRequest.vs1 + record.groupCounter( - parameter.groupNumberBits - 1, - parameter.vrfOffsetBits - ) - // used for hazard detection - vrfReadRequest(index)(0).bits.instructionIndex := record.laneRequest.instructionIndex - - // vs2 read - vrfReadRequest(index)(1).valid := !record.state.sRead2 && slotActive(index) - vrfReadRequest(index)(1).bits.offset := record.groupCounter(parameter.vrfOffsetBits - 1, 0) - // todo: when vlmul > 0 use ## rather than + - // TODO: pull Mux to standalone signal - vrfReadRequest(index)(1).bits.vs := record.laneRequest.vs2 + - record.groupCounter(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) - vrfReadRequest(index)(1).bits.instructionIndex := record.laneRequest.instructionIndex + // next for update cross read register + val sReadNext0: Bool = RegNext(sRead0, false.B) + val sReadNext1: Bool = RegNext(sRead1, false.B) + val sReadNext2: Bool = RegNext(sRead2, false.B) + val sCrossReadLSBNext: Option[Bool] = sCrossReadLSB.map(RegNext(_, false.B)) + val sCrossReadMSBNext: Option[Bool] = sCrossReadMSB.map(RegNext(_, false.B)) + // All read requests sent + val sReadFinish: Bool = sRead0 && sRead1 && sRead2 + // Waiting to read the response + val sReadFinishNext: Bool = sReadNext0 && sReadNext1 && sReadNext2 + // 'sReadFinishNext' may assert at the next cycle of 's1Fire', need sReadFinish + val readFinish: Bool = sReadFinish && sReadFinishNext + val stage1Finish: Bool = (Seq(readFinish) ++ sSendCrossReadResultLSB ++ + sSendCrossReadResultMSB ++ wCrossReadLSB ++ wCrossReadMSB).reduce(_ && _) - // vd read - vrfReadRequest(index)(2).valid := !record.state.sReadVD && slotActive(index) - vrfReadRequest(index)(2).bits.offset := record.groupCounter(parameter.vrfOffsetBits - 1, 0) - vrfReadRequest(index)(2).bits.vs := record.laneRequest.vd + - record.groupCounter(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) - // for hazard detection - vrfReadRequest(index)(2).bits.instructionIndex := record.laneRequest.instructionIndex + // control wire + val s1Valid = valid0 + val s1Ready = Wire(Bool()) + val s1Fire = s1Valid && s1Ready + val sSendResponseInStage1 = Option.when(isLastSlot)(RegEnable(sSendResponseInStage0.get, true.B, s1Fire)) - /** all read operation is finished. */ - val readFinish = RegNext(readVrfRequestFinish) && readVrfRequestFinish + when(s1Fire ^ s0Fire) { valid0 := s0Fire } + s0Ready := s1Ready || !valid0 - // state machine control - when(vrfReadRequest(index)(0).fire) { - record.state.sRead1 := true.B - } - when(readNext0) { - source1(index) := vrfReadResult(index)(0) - } - when(vrfReadRequest(index)(1).fire) { - record.state.sRead2 := true.B - } - when(readNext1) { - source2(index) := vrfReadResult(index)(1) - } - when(vrfReadRequest(index)(2).fire) { - record.state.sReadVD := true.B - source3(index) := vrfReadResult(index)(2) - } - when(readNext2) { - source3(index) := vrfReadResult(index)(2) - } - - /** 这一组的mask已经没有剩余了 */ - val maskNeedUpdate = !nextOrR - val nextGroupCountMSB: UInt = Mux1H( - vSew1H(1, 0), - Seq( - record.groupCounter(parameter.groupNumberBits - 1, parameter.groupNumberBits - 3), - false.B ## record.groupCounter(parameter.groupNumberBits - 1, parameter.groupNumberBits - 2) - ) - ) + maskNeedUpdate - val indexInLane = nextGroupCountMSB ## nextIndex - // csrInterface.vSew 只会取值0, 1, 2,需要特别处理移位 - val nextIntermediateVolume = (indexInLane << record.csr.vSew).asUInt - val nextGroupCount = nextIntermediateVolume(parameter.groupNumberBits + 1, 2) - val nextExecuteIndex = nextIntermediateVolume(1, 0) - - /** 虽然没有计算完一组,但是这一组剩余的都被mask去掉了 */ - val maskFilterEnd = skipEnable && (nextGroupCount =/= record.groupCounter) - - /** 需要一个除vl导致的end来决定下一个的 element index 是什么 */ - val dataDepletion = writeIndex === slotGroupFinishedIndex || maskFilterEnd - - /** 这一组计算全完成了 */ - val groupEnd = dataDepletion || instructionExecuteFinished(index) - - /** 计算当前这一组的 vrf mask - * 已知:mask mask1H executeIndex - * sew match: - * 0: - * executeIndex match: - * 0: 0001 - * 1: 0010 - * 2: 0100 - * 3: 1000 - * 1: - * executeIndex(0) match: - * 0: 0011 - * 1: 1100 - * 2: - * 1111 - */ - val executeByteEnable = Mux1H( - vSew1H(2, 0), - Seq( - UIntToOH(record.executeIndex), - record.executeIndex(1) ## record.executeIndex(1) ## !record.executeIndex(1) ## !record.executeIndex(1), - 15.U(4.W) - ) - ) - val executeBitEnable: UInt = FillInterleaved(8, executeByteEnable) - - def CollapseOperand(data: UInt, enable: Bool = true.B, sign: Bool = false.B): UInt = { - val dataMasked: UInt = data & executeBitEnable - val select: UInt = Mux(enable, vSew1H(2, 0), 4.U(3.W)) - // when sew = 0 - val collapse0 = Seq.tabulate(4)(i => dataMasked(8 * i + 7, 8 * i)).reduce(_ | _) - // when sew = 1 - val collapse1 = Seq.tabulate(2)(i => dataMasked(16 * i + 15, 16 * i)).reduce(_ | _) - Mux1H( - select, - Seq( - Fill(25, sign && collapse0(7)) ## collapse0, - Fill(17, sign && collapse1(15)) ## collapse1, - (sign && data(31)) ## data - ) - ) - } - - // 处理操作数 - /** - * src1: src1有 IXV 三种类型,只有V类型的需要移位 - */ - val finalSource1 = CollapseOperand( - Mux(reduceType, reduceResult(index), source1(index)), - decodeResult(Decoder.vtype) && !reduceType, - !decodeResult(Decoder.unsigned0) - ) - - /** source2 一定是V类型的 */ - val finalSource2 = CollapseOperand(source2(index), true.B, !decodeResult(Decoder.unsigned1)) - - /** source3 有两种:adc & ma, c等处理mask的时候再处理 */ - val finalSource3 = CollapseOperand(source3(index)) - - val nextElementIndex = Mux1H( - vSew1H, - Seq( - indexInLane(indexInLane.getWidth - 1, 2) ## laneIndex ## indexInLane(1, 0), - indexInLane(indexInLane.getWidth - 1, 1) ## laneIndex ## indexInLane(0), - indexInLane ## laneIndex - ) - ) - instructionExecuteFinished(index) := nextElementIndex >= record.csr.vl || maskLogicWillCompleted - // 假如这个单元执行的是logic的类型的,请求应该是什么样子的 - val logicRequest = Wire(new MaskedLogicRequest(parameter.datapathWidth)) - logicRequest.src.head := finalSource2 - logicRequest.src(1) := finalSource1 - logicRequest.src(2) := maskCorrect - logicRequest.src(3) := finalSource3 - logicRequest.opcode := decodeResult(Decoder.uop) - // 在手动做Mux1H - logicRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.logic) && !decodeResult(Decoder.other), - logicRequest - ) - - // adder 的 - val adderRequest = Wire(new LaneAdderReq(parameter.datapathWidth)) - adderRequest.src := VecInit(Seq(finalSource1, finalSource2)) - adderRequest.mask := maskAsInput - adderRequest.opcode := decodeResult(Decoder.uop) - adderRequest.sign := !decodeResult(Decoder.unsigned1) - adderRequest.reverse := decodeResult(Decoder.reverse) - adderRequest.average := decodeResult(Decoder.average) - adderRequest.saturate := decodeResult(Decoder.saturate) - adderRequest.vxrm := record.csr.vxrm - adderRequest.vSew := record.csr.vSew - adderRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.adder) && !decodeResult(Decoder.other), - adderRequest - ) - - // shift 的 - val shiftRequest = Wire(new LaneShifterReq(parameter.shifterParameter)) - shiftRequest.src := finalSource2 - shiftRequest.shifterSize := Mux1H( - vSew1H(2, 1), - Seq(false.B ## finalSource1(3), finalSource1(4, 3)) - ) ## finalSource1(2, 0) - shiftRequest.opcode := decodeResult(Decoder.uop) - shiftRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.shift) && !decodeResult(Decoder.other), - shiftRequest - ) - shiftRequest.vxrm := record.csr.vxrm - - // mul - val mulRequest: LaneMulReq = Wire(new LaneMulReq(parameter.mulParam)) - mulRequest.src := VecInit(Seq(finalSource1, finalSource2, finalSource3)) - mulRequest.opcode := decodeResult(Decoder.uop) - mulRequest.saturate := decodeResult(Decoder.saturate) - mulRequest.vSew := record.csr.vSew - mulRequest.vxrm := record.csr.vxrm - multiplerRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.multiplier) && !decodeResult(Decoder.other), - mulRequest - ) - - // div - val divRequest = Wire(new LaneDivRequest(parameter.datapathWidth)) - divRequest.src := VecInit(Seq(finalSource1, finalSource2)) - divRequest.rem := decodeResult(Decoder.uop)(0) - divRequest.sign := !decodeResult(Decoder.unsigned0) - divRequest.index := record.executeIndex - dividerRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.divider) && !decodeResult(Decoder.other), - divRequest + /** mask offset for this group, needs to be aligned with data group */ + val maskOffsetForNextGroup: UInt = maskIndexVec(index)(4, 2) ## Mux1H( + vSew1H(2, 0), + Seq( + 0.U(2.W), + maskIndexVec(index)(1) ## false.B, + maskIndexVec(index)(1, 0) ) + ) - // other - // TODO: remove output - val otherRequest: OtherUnitReq = Wire(Output(new OtherUnitReq(parameter))) - otherRequest.src := VecInit(Seq(finalSource1, finalSource2, finalSource3)) - otherRequest.popInit := reduceResult(index) - otherRequest.opcode := decodeResult(Decoder.uop) - otherRequest.imm := record.laneRequest.vs1 - otherRequest.laneIndex := laneIndex - otherRequest.groupIndex := record.groupCounter - otherRequest.executeIndex := record.executeIndex - otherRequest.sign := !decodeResult(Decoder.unsigned0) - otherRequest.mask := maskAsInput || !record.laneRequest.mask - otherRequest.complete := record.ffoByOtherLanes || record.selfCompleted - otherRequest.maskType := record.laneRequest.mask - otherRequest.vSew := record.csr.vSew - otherRequest.vxrm := record.csr.vxrm - otherRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.other), - otherRequest - ) + /** mask for this group */ + val nextMaskForGroup: UInt = (record.mask.bits >> maskOffsetForNextGroup)(3, 0) - // 往scheduler的执行任务compress viota - // TODO: remove output - val maskRequest: LaneResponse = Wire(Output(new LaneResponse(parameter))) - val canSendMaskRequest = needResponse && readFinish && record.state.sExecute - val maskValid = canSendMaskRequest && !record.state.sSendResponse - val noNeedWaitScheduler: Bool = !canSendMaskRequest || decodeResult(Decoder.scheduler) || schedulerFinish - // 往外边发的是原始的数据 - maskRequest.data := Mux( - record.laneRequest.decodeResult(Decoder.maskDestination), - maskFormatResult, - Mux( - reduceType, - reduceResult(index), - source2(index) - ) - ) - maskRequest.toLSU := record.laneRequest.loadStore - maskRequest.instructionIndex := record.laneRequest.instructionIndex - maskRequest.ffoSuccess := record.selfCompleted - maskRequests(index) := maskAnd(slotOccupied(index) && maskValid, maskRequest) - maskRequestValids(index) := maskValid - - when( - laneResponseFeedback.valid && laneResponseFeedback.bits.instructionIndex === record.laneRequest.instructionIndex - ) { - record.state.wResponseFeedback := true.B - } - when(maskValid) { - record.state.sSendResponse := true.B - } - instructionTypeVec(index) := record.laneRequest.instType - executeEnqueueValid(index) := maskAnd(readFinish && !record.state.sExecute, instructionTypeVec(index)) - when((instructionTypeVec(index) & executeEnqueueFire).orR) { - when(groupEnd) { - record.state.sExecute := true.B - }.otherwise { - record.executeIndex := nextExecuteIndex - } - } + // --- stage 1_0 end & stage 1_1 start --- - // todo: 暂时先这样,处理mask的时候需要修 - // TODO: hardware effort is too large since 5 bits dynamic shifting is too large. - val executeResult = (dataDequeue << dataOffset).asUInt(parameter.datapathWidth - 1, 0) - val resultUpdate: UInt = (executeResult & writeMaskInBit) | (executionResult(index) & (~writeMaskInBit).asUInt) - when(dataDequeueFire) { - when(groupEnd) { - record.state.wExecuteRes := true.B - } - executionResult(index) := resultUpdate - record.selfCompleted := otherResponse.ffoSuccess - when(!masked) { - record.vrfWriteMask := record.vrfWriteMask | executeByteEnable - when(reduceType) { - reduceResult(index) := dataDequeue - } - } - } - // 写rf - vrfWriteArbiter(index).valid := - record.state.wExecuteRes && !record.state.sWrite && slotActive(index) && noNeedWaitScheduler - vrfWriteArbiter(index).bits.vd := record.laneRequest.vd + record.groupCounter( + // read port 0 + vrfReadRequest(index)(0).valid := !sRead0 && valid1 + vrfReadRequest(index)(0).bits.offset := groupCounterInStage1(parameter.vrfOffsetBits - 1, 0) + vrfReadRequest(index)(0).bits.vs := Mux( + // encodings with vm=0 are reserved for mask type logic + record.laneRequest.decodeResult(Decoder.maskLogic) && !record.laneRequest.decodeResult(Decoder.logic), + // read v0 for (15. Vector Mask Instructions) + 0.U, + record.laneRequest.vs1 + groupCounterInStage1( parameter.groupNumberBits - 1, parameter.vrfOffsetBits ) - vrfWriteArbiter(index).bits.offset := record.groupCounter - vrfWriteArbiter(index).bits.data := Mux(record.ffoByOtherLanes, 0.U, executionResult(index)) - vrfWriteArbiter(index).bits.last := instructionExecuteFinished(index) - vrfWriteArbiter(index).bits.instructionIndex := record.laneRequest.instructionIndex - vrfWriteArbiter(index).bits.mask := record.vrfWriteMask - when(vrfWriteFire(index)) { - record.state.sWrite := true.B - } - instructionFinishedVec(index) := 0.U(parameter.chainingSize.W) - val maskUnhindered = maskRequestFireOH(index) || !maskNeedUpdate - when((record.state.asUInt.andR && maskUnhindered) || record.instructionFinished) { - when((instructionExecuteFinished(index) && noFeedBack) || record.instructionFinished) { - slotOccupied(index) := false.B - when(slotOccupied(index)) { - instructionFinishedVec(index) := UIntToOH( - record.laneRequest.instructionIndex(parameter.instructionIndexBits - 2, 0) - ) - } - }.otherwise { - - record.state := record.laneRequest.initState - record.groupCounter := nextGroupCount - record.executeIndex := nextExecuteIndex - record.vrfWriteMask := 0.U - when(maskRequestFireOH(index)) { - record.mask.valid := true.B - record.mask.bits := maskInput - record.maskGroupedOrR := maskGroupedOrR - } - } - } - when( - laneResponseFeedback.bits.complete && laneResponseFeedback.bits.instructionIndex === record.laneRequest.instructionIndex - ) { - // 例如:别的lane找到了第一个1 - record.ffoByOtherLanes := true.B - when(decodeResult(Decoder.dontNeedExecuteInLane)) { - slotOccupied(index) := false.B - } - } - // mask 更换 - slotMaskRequestVec(index).valid := maskNeedUpdate - slotMaskRequestVec(index).bits := nextGroupCountMSB - } else { // slotNumber == 0 - val decodeResult: DecodeBundle = record.laneRequest.decodeResult - val needCrossRead = decodeResult(Decoder.crossRead) - - /** cross read has two case - * - read vs2 - * - read vd: only vwmacc - */ - val crossReadVS2: Bool = needCrossRead && !decodeResult(Decoder.vwmacc) - - /** State machine may jump through the group if the mask is all 0. - * For these case it cannot jump through: - * [[needCrossRead]]: although we may not need execution unit if it is masked, - * we still need to access VRF for another lane - * [[record.laneRequest.special]]: We need to synchronize with [[V]] every group - * TODO: uarch about the synchronization - * [[nr]] will ignore mask - */ - val alwaysNextGroup: Bool = needCrossRead || record.laneRequest.special || nr - - /** select from VFU, send to [[executionResult]], [[crossWriteDataLSBHalf]], [[crossWriteDataMSBHalf]]. */ - val dataDequeue: UInt = Mux1H(instructionTypeVec(index), executeDequeueData) - - /** fire of [[dataDequeue]] */ - val dataDequeueFire: Bool = (instructionTypeVec(index) & executeDequeueFire).orR - - // TODO: move this to verification module - when(needCrossRead) { - assert(record.csr.vSew != 2.U) - } - - // TODO: for index = 0, slotOccupied(index) === slotOccupied.head - slotActive(index) := - // slot should alive - slotOccupied(index) && - // head should alive, if not, the slot should shift to make head alive - slotOccupied.head && - // mask should ready for masked instruction - maskReady - - // shift slot - slotCanShift(index) := !(record.state.sExecute && slotOccupied(index)) - - // vs1 read - vrfReadRequest(index)(0).valid := !record.state.sRead1 && slotActive(index) - vrfReadRequest(index)(0).bits.offset := record.groupCounter(parameter.vrfOffsetBits - 1, 0) - vrfReadRequest(index)(0).bits.vs := Mux( - record.laneRequest.decodeResult(Decoder.maskLogic) && - !record.laneRequest.decodeResult(Decoder.logic), - // read v0 for (15. Vector Mask Instructions) - 0.U, - // todo: when vlmul > 0 use ## rather than + - record.laneRequest.vs1 + record.groupCounter( - parameter.groupNumberBits - 1, - parameter.vrfOffsetBits - ) - ) - // used for hazard detection - vrfReadRequest(index)(0).bits.instructionIndex := record.laneRequest.instructionIndex + ) + // used for hazard detection + vrfReadRequest(index)(0).bits.instructionIndex := record.laneRequest.instructionIndex - // vs2 read - vrfReadRequest(index)(1).valid := !(record.state.sRead2 && record.state.sCrossReadLSB) && slotActive(index) + // read port 1 + if (isLastSlot) { + vrfReadRequest(index)(1).valid := !(sRead1 && sCrossReadLSB.get) && valid1 vrfReadRequest(index)(1).bits.offset := Mux( - record.state.sRead2, + sRead1, // cross lane LSB - record.groupCounter(parameter.vrfOffsetBits - 2, 0) ## false.B, + groupCounterInStage1(parameter.vrfOffsetBits - 2, 0) ## false.B, // normal read - record.groupCounter(parameter.vrfOffsetBits - 1, 0) + groupCounterInStage1(parameter.vrfOffsetBits - 1, 0) ) - // todo: when vlmul > 0 use ## rather than + - // TODO: pull Mux to standalone signal vrfReadRequest(index)(1).bits.vs := Mux( - record.laneRequest.decodeResult(Decoder.vwmacc) && record.state.sRead2, + decodeResult(Decoder.vwmacc) && sRead1, // cross read vd for vwmacc, since it need dual [[dataPathWidth]], use vs2 port to read LSB part of it. record.laneRequest.vd, // read vs2 for other instruction record.laneRequest.vs2 ) + Mux( - record.state.sRead2, + sRead1, // cross lane - record.groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1), + groupCounterInStage1(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1), // no cross lane - record.groupCounter(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) + groupCounterInStage1(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) ) - vrfReadRequest(index)(1).bits.instructionIndex := record.laneRequest.instructionIndex + } else { + vrfReadRequest(index)(1).valid := !sRead1 && valid1 + vrfReadRequest(index)(1).bits.offset := groupCounterInStage1(parameter.vrfOffsetBits - 1, 0) + vrfReadRequest(index)(1).bits.vs := record.laneRequest.vs2 + + groupCounterInStage1(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) + } + vrfReadRequest(index)(1).bits.instructionIndex := record.laneRequest.instructionIndex - // vd read - vrfReadRequest(index)(2).valid := !(record.state.sReadVD && record.state.sCrossReadMSB) && slotActive(index) + // read port 2 + if (isLastSlot) { + vrfReadRequest(index)(2).valid := !(sRead2 && sCrossReadMSB.get) && valid1 vrfReadRequest(index)(2).bits.offset := Mux( - record.state.sReadVD, + sRead2, // cross lane MSB - record.groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B, + groupCounterInStage1(parameter.vrfOffsetBits - 2, 0) ## true.B, // normal read - record.groupCounter(parameter.vrfOffsetBits - 1, 0) + groupCounterInStage1(parameter.vrfOffsetBits - 1, 0) ) vrfReadRequest(index)(2).bits.vs := Mux( - record.state.sReadVD && !record.laneRequest.decodeResult(Decoder.vwmacc), + sRead2 && !record.laneRequest.decodeResult(Decoder.vwmacc), // cross lane access use vs2 record.laneRequest.vs2, // normal read vd or cross read vd for vwmacc record.laneRequest.vd ) + Mux( - record.state.sReadVD, - record.groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1), - record.groupCounter(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) + sRead2, + groupCounterInStage1(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1), + groupCounterInStage1(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) ) - // for hazard detection - vrfReadRequest(index)(2).bits.instructionIndex := record.laneRequest.instructionIndex - - /** all read operation is finished. - * - * notice: this doesn't include [[crossReadVRFOnly]] - */ - val readFinish = - // wait one cycle for VRF read latency. - RegNext(readVrfRequestFinish, false.B) && - // RegNext(readVrfRequestFinish) is not initialized, to avoid the invalid value of last group, - // fanout readVrfRequestFinish directly. - readVrfRequestFinish && - // wait for cross lane read result - record.state.wCrossReadLSB && - record.state.wCrossReadMSB && - record.state.sCrossReadLSB && - record.state.sCrossReadMSB - - // state machine control + } else { + vrfReadRequest(index)(2).valid := !sRead2 && valid1 + vrfReadRequest(index)(2).bits.offset := groupCounterInStage1(parameter.vrfOffsetBits - 1, 0) + vrfReadRequest(index)(2).bits.vs := record.laneRequest.vd + + groupCounterInStage1(parameter.groupNumberBits - 1, parameter.vrfOffsetBits) + } + vrfReadRequest(index)(2).bits.instructionIndex := record.laneRequest.instructionIndex + + val readPortFire0: Bool = vrfReadRequest(index)(0).fire + val readPortFire1: Bool = vrfReadRequest(index)(1).fire + val readPortFire2: Bool = vrfReadRequest(index)(2).fire + // reg next for update result + val readPortFireNext0: Bool = RegNext(readPortFire0, false.B) + val readPortFireNext1: Bool = RegNext(readPortFire1, false.B) + val readPortFireNext2: Bool = RegNext(readPortFire2, false.B) + + // update read control register in stage 1 + when(s1Fire) { + // init register by decode result + sRead0 := !decodeResult(Decoder.vtype) + // todo: gather only read vs1? + sRead1 := false.B + sRead2 := decodeResult(Decoder.sReadVD) + val sCrossRead = !decodeResult(Decoder.crossRead) + ( + sCrossReadLSB ++ sCrossReadMSB ++ + sSendCrossReadResultLSB ++ sSendCrossReadResultMSB ++ + wCrossReadLSB ++ wCrossReadMSB + ).foreach(state => state := sCrossRead) + + // pipe reg from stage 0 + groupCounterInStage1 := groupCounterInStage0 + maskInStage1 := maskInStage0 + }.otherwise { // change state machine when read source1 - when(vrfReadRequest(index)(0).fire) { - record.state.sRead1 := true.B + when(readPortFire0) { + sRead0 := true.B } - - // read result from source1 need read latency - when(RegNext(vrfReadRequest(index)(0).fire)) { - // todo: datapath Mux - source1(index) := vrfReadResult(index)(0) + // the priority of `sRead1` is higher than `sCrossReadLSB` + when(readPortFire1) { + sRead1 := true.B + sCrossReadLSB.foreach(d => d := sRead1) } - - // the priority of `sRead2` is higher than `sCrossReadLSB` - when(vrfReadRequest(index)(1).fire) { - record.state.sRead2 := true.B - when(record.state.sRead2) { - record.state.sCrossReadLSB := true.B - } + // the priority of `sRead2` is higher than `sCrossReadMSB` + when(readPortFire2) { + sRead2 := true.B + sCrossReadMSB.foreach(d => d := sRead2) } - // read result from source2 need read latency - when(RegNext(vrfReadRequest(index)(1).fire)) { - when(RegNext(record.state.sRead2)) { - crossReadLSBOut := vrfReadResult(index)(1) + when(readBusDequeue.valid) { + when(readBusDequeue.bits.isTail) { + wCrossReadMSB.foreach(_ := true.B) }.otherwise { - source2(index) := vrfReadResult(index)(1) + wCrossReadLSB.foreach(_ := true.B) } } + } - // the priority of `sReadVD` is higher than `sCrossReadMSB` - when(vrfReadRequest(index)(2).fire) { - record.state.sReadVD := true.B - when(record.state.sReadVD) { - record.state.sCrossReadMSB := true.B + // update read result register + when(readPortFireNext0) { + readResult0 := vrfReadResult(index)(0) + } + + when(readPortFireNext1) { + if (isLastSlot) { + when(sReadNext1) { + crossReadLSBOut := vrfReadResult(index)(1) + }.otherwise { + readResult1 := vrfReadResult(index)(1) } + } else { + readResult1 := vrfReadResult(index)(1) } + } - // read result from vd need read latency - when(RegNext(vrfReadRequest(index)(2).fire)) { - when(RegNext(record.state.sReadVD)) { + when(readPortFireNext2) { + if (isLastSlot) { + when(sReadNext2) { crossReadMSBOut := vrfReadResult(index)(2) }.otherwise { - source3(index) := vrfReadResult(index)(2) + readResult2 := vrfReadResult(index)(2) } + } else { + readResult2 := vrfReadResult(index)(2) } + } - // VRF cross lane read: - // for each cross lane read access: - // it always access datapath width of VRF in two lanes. - // the source lane will consume: [[crossReadLSBOut]] and [[crossReadMSBOut]] - // it should be sent out to corresponding lane to [[crossLaneRead.bits.sinkIndex]] - - // 1. group all VRF together, index them under the datapath width - // 2. the cross read/write take dual size of datapath width. - // - // example of cross lane read - // 0| 1| 2| 3| 4| 5| 6| 7 - // 8| 9|10|11|12|13|14|15 - // 16|17|18|19|20|21|22|23 - // 24|25|26|27|28|29|30|31 - - /** for cross lane read LSB is read from VRF, ready to send out to ring. */ - val crossReadDataReadyLSB: Bool = record.state.sCrossReadLSB && RegNext(record.state.sCrossReadLSB) - - /** for cross lane read MSB is read from VRF, ready to send out to ring. */ - val crossReadDataReadyMSB: Bool = record.state.sCrossReadMSB && RegNext(record.state.sCrossReadMSB) + if (isLastSlot) { + // cross read + /** for dequeue group counter match */ + readBusDequeueGroup := groupCounterInStage1 + /** The data to be sent is ready + * need sCrossReadLSB since sCrossReadLSBNext may assert after s1fire. + */ + val crossReadDataReadyLSB: Bool = (sCrossReadLSBNext ++ sCrossReadLSB).reduce(_ && _) + val crossReadDataReadyMSB: Bool = (sCrossReadMSBNext ++ sCrossReadMSB).reduce(_ && _) /** read data from RF, try to send cross lane read LSB data to ring */ - val tryCrossReadSendLSB: Bool = - crossReadDataReadyLSB && !record.state.sSendCrossReadResultLSB && slotOccupied.head + val tryCrossReadSendLSB: Bool = crossReadDataReadyLSB && !sSendCrossReadResultLSB.get && valid1 /** read data from RF, try to send cross lane read MSB data to ring */ - val tryCrossReadSendMSB: Bool = - crossReadDataReadyMSB && !record.state.sSendCrossReadResultMSB && slotOccupied.head + val tryCrossReadSendMSB: Bool = crossReadDataReadyMSB && !sSendCrossReadResultMSB.get && valid1 // TODO: use [[record.state.sSendCrossReadResultLSB]] crossLaneRead.bits.sinkIndex := (!tryCrossReadSendLSB) ## laneIndex(parameter.laneNumberBits - 1, 1) crossLaneRead.bits.isTail := laneIndex(0) crossLaneRead.bits.sourceIndex := laneIndex crossLaneRead.bits.instructionIndex := record.laneRequest.instructionIndex - crossLaneRead.bits.counter := record.groupCounter - // TODO: use [[record.state.sSendCrossReadResultLSB]] + crossLaneRead.bits.counter := groupCounterInStage1 + // TODO: use [[record.state.sSendCrossReadResultLSB]] -> MSB may be ready earlier crossLaneRead.bits.data := Mux(tryCrossReadSendLSB, crossReadLSBOut, crossReadMSBOut) crossLaneRead.valid := tryCrossReadSendLSB || tryCrossReadSendMSB - // VRF cross write - /** execute in ALU, try to send cross lane write LSB data to ring */ - val tryCrossWriteSendLSB = record.state.sExecute && !record.state.sCrossWriteLSB && slotOccupied.head - - /** execute in ALU, try to send cross lane write MSB data to ring */ - val tryCrossWriteSendMSB = record.state.sExecute && !record.state.sCrossWriteMSB && slotOccupied.head - crossLaneWrite.bits.sinkIndex := laneIndex(parameter.laneNumberBits - 2, 0) ## (!tryCrossWriteSendLSB) - crossLaneWrite.bits.sourceIndex := laneIndex - crossLaneWrite.bits.isTail := laneIndex(parameter.laneNumberBits - 1) - crossLaneWrite.bits.instructionIndex := record.laneRequest.instructionIndex - crossLaneWrite.bits.counter := record.groupCounter - crossLaneWrite.bits.data := Mux(tryCrossWriteSendLSB, crossWriteDataLSBHalf, crossWriteDataMSBHalf) - crossLaneWrite.bits.mask := Mux(tryCrossWriteSendLSB, crossWriteMaskLSBHalf, crossWriteMaskMSBHalf) - crossLaneWrite.valid := tryCrossWriteSendLSB || tryCrossWriteSendMSB - - // cross read receive. - when(readBusDequeue.valid) { - assert(readBusDequeue.bits.instructionIndex === record.laneRequest.instructionIndex) - when(readBusDequeue.bits.isTail) { - record.state.wCrossReadMSB := true.B - crossReadMSBIn := readBusDequeue.bits.data - }.otherwise { - record.state.wCrossReadLSB := true.B - crossReadLSBIn := readBusDequeue.bits.data - } - } - - // todo: handling self cross read for first and end lane. - // maintain cross read send state machine. when(crossLaneReadReady && crossLaneRead.valid) { when(tryCrossReadSendLSB) { - record.state.sSendCrossReadResultLSB := true.B + sSendCrossReadResultLSB.foreach(_ := true.B) }.otherwise { - record.state.sSendCrossReadResultMSB := true.B - } - } - // maintain cross write send state machine. - when(crossLaneWriteReady && crossLaneWrite.valid) { - record.state.sCrossWriteLSB := true.B - when(record.state.sCrossWriteLSB) { - record.state.sCrossWriteMSB := true.B + sSendCrossReadResultMSB.foreach(_ := true.B) } } - /** sew: - * 0: - * executeIndex: - * 0: mask = 0011, head - * 1: mask = 1100, head - * 2: mask = 0011, tail - * 3: mask = 1100, tail - * 1: - * executeIndex: - * 0: mask = 1111, head - * 2: mask = 1111, tail - * - * 2: not valid in SEW = 2 - */ - when(dataDequeueFire && !masked) { - when(record.executeIndex(1)) { - // update tail - crossWriteDataMSBHalf := - Mux( - record.csr.vSew(0), - dataDequeue(parameter.datapathWidth - 1, parameter.halfDatapathWidth), - Mux( - record.executeIndex(0), - dataDequeue(parameter.halfDatapathWidth - 1, 0), - crossWriteDataMSBHalf(parameter.datapathWidth - 1, parameter.halfDatapathWidth) - ) - ) ## Mux( - !record.executeIndex(0) || record.csr.vSew(0), - dataDequeue(parameter.halfDatapathWidth - 1, 0), - crossWriteDataMSBHalf(parameter.halfDatapathWidth - 1, 0) - ) - crossWriteMaskMSBHalf := - (record.executeIndex(0) || record.csr.vSew(0) || crossWriteMaskMSBHalf(1)) ## - (!record.executeIndex(0) || record.csr.vSew(0) || crossWriteMaskMSBHalf(0)) + // cross read receive. todo: move out slot + when(readBusDequeue.valid) { + assert(readBusDequeue.bits.instructionIndex === record.laneRequest.instructionIndex) + when(readBusDequeue.bits.isTail) { + crossReadMSBIn := readBusDequeue.bits.data }.otherwise { - // update head - crossWriteDataLSBHalf := - Mux( - record.csr.vSew(0), - dataDequeue(parameter.datapathWidth - 1, parameter.halfDatapathWidth), - Mux( - record.executeIndex(0), - dataDequeue(parameter.halfDatapathWidth - 1, 0), - crossWriteDataLSBHalf(parameter.datapathWidth - 1, parameter.halfDatapathWidth) - ) - ) ## Mux( - !record.executeIndex(0) || record.csr.vSew(0), - dataDequeue(parameter.halfDatapathWidth - 1, 0), - crossWriteDataLSBHalf(parameter.halfDatapathWidth - 1, 0) - ) - crossWriteMaskLSBHalf := - (record.executeIndex(0) || record.csr.vSew(0) || crossWriteMaskLSBHalf(1)) ## - (!record.executeIndex(0) || record.csr.vSew(0) || crossWriteMaskLSBHalf(0)) + crossReadLSBIn := readBusDequeue.bits.data } } + } - // clear mask when group change. - when(record.state.asUInt.andR) { - crossWriteMaskLSBHalf := 0.U - crossWriteMaskMSBHalf := 0.U - } - - /** we have used mask inside mask group, and need to request from [[V]] */ - val maskNeedUpdate = !nextOrR && (!alwaysNextGroup || lastGroupForMask) - - /** the MSB part of [[nextElementIndexInLane]] - * [[nextIndex]] is the log2 of [[parameter.datapathWidth]], - * it contains the higher bits of element index of lanes - * thus [[nextGroupCountMSB]] has the MSB suffix. - */ - val nextGroupCountMSB: UInt = Mux1H( - vSew1H(1, 0), - Seq( - record.groupCounter(parameter.groupNumberBits - 1, parameter.groupNumberBits - 3), - false.B ## record.groupCounter(parameter.groupNumberBits - 1, parameter.groupNumberBits - 2) - ) - ) + maskNeedUpdate - - /** the next element index in lane to execute */ - val nextElementIndexInLane = nextGroupCountMSB ## nextIndex - - /** the data offset of next element in lane to execute - * TODO: [[record.csr.vSew]] only has value 0,1,2 for 32bits. - */ - val nextElementOffset = (nextElementIndexInLane << record.csr.vSew).asUInt - - /** calculate the next group with the valid mask. - * if ![[alwaysNextGroup]] and some group has no valid mask, - * it will be skipped. - */ - val nextGroupMasked: UInt = nextElementOffset(parameter.groupNumberBits + 1, 2) - - /** which execution group to execute next. */ - val nextGroupCount = Mux( - alwaysNextGroup, - record.groupCounter + 1.U, - nextGroupMasked - ) - - /** the left element in this group has been filter out by the mask. */ - val maskFilterEnd = skipEnable && (nextGroupMasked =/= record.groupCounter) - - /** this is the last group of the current instruction. */ - val lastExecuteGroup: Bool = lastGroupForLane === record.groupCounter - - /** the index of execute index in this lane. - * this is used to handle the misalignment of vl. - * for [[lastExecuteGroup]] and [[isEndLane]], use [[lastElementExecuteIndex]] as the index. - * TODO: where is the logic of [[Decoder.maskLogic]] - */ - val lastExecuteIndex = Mux( - lastExecuteGroup && isEndLane && !record.laneRequest.decodeResult(Decoder.maskLogic), - lastElementExecuteIndex, - slotGroupFinishedIndex + // --- stage 1_1 end & stage 2 start --- + val executionQueue: Queue[LaneExecuteStage] = + Module(new Queue(new LaneExecuteStage(parameter)(isLastSlot), parameter.executionQueueSize)) + + val s2Ready = Wire(Bool()) + val s2Valid = valid1 && stage1Finish + val s2Fire: Bool = s2Ready && s2Valid + val valid2 = RegInit(false.B) + // need clear mask format result when mask group change + val updateMaskResult: Option[Bool] = Option.when(isLastSlot)(Wire(Bool())) + // backpressure for stage 1 + s1Ready := !valid1 || (stage1Finish && s2Ready) + // update 'valid1' + when(s1Fire ^ s2Fire) {valid1 := s1Fire} + val s2ExecuteOver = Wire(Bool()) + + // execution result from execute unit + val executionResult = RegInit(0.U(parameter.datapathWidth.W)) + + /** mask format result for current `mask group` */ + val maskFormatResultForGroup: Option[UInt] = Option.when(isLastSlot)(RegInit(0.U(parameter.maskGroupWidth.W))) + + /** cross write LSB mask to send out to other lanes. */ + val Stage2crossWriteLSB = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + + /** cross write MSB data to send out to other lanes. */ + val Stage2crossWriteMSB = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + // pipe from stage 0 + val sSendResponseInStage2 = Option.when(isLastSlot)(RegEnable(sSendResponseInStage1.get, true.B, s2Fire)) + // ffo success in current data group? + val ffoSuccessImStage2: Option[Bool] = Option.when(isLastSlot)(RegInit(false.B)) + + // executionQueue enqueue + executionQueue.io.enq.bits.pipeData.foreach { data => + data := Mux( + // pipe source1 for gather, pipe v0 for ffo + decodeResult(Decoder.gather) || decodeResult(Decoder.ffo), + readResult0, + readResult1 ) - - /** the next group enqueuing to the execution unit is the last group. */ - val enqGroupEnd = (record.executeIndex === lastExecuteIndex) || maskFilterEnd - - /** the current element group is the last group of this instruction. */ - val elementGroupEnd = (writeIndex === lastExecuteIndex) || maskFilterEnd - - /** for the case that, - * if the the current group is the last group in the instruction - * and it need to cross read VRF. - * some lane doesn't need to execute but only need to read VRF. - * this signal is used to indicate that. - */ - val crossReadVRFOnly = lastGroupForLane < record.groupCounter && noFeedBack - - /** this is the last group of [[Decoder.nr]] type - */ - val nrEnd: Bool = record.groupCounter === record.laneRequest.vs1 ## 3.U(2.W) - - // end the execution. - instructionExecuteFinished(index) := Mux( - nr, - nrEnd, - crossReadVRFOnly || (lastExecuteGroup && elementGroupEnd) || maskLogicWillCompleted + } + executionQueue.io.enq.bits.pipeVD.foreach(_ := readResult2) + executionQueue.io.enq.bits.groupCounter := groupCounterInStage1 + executionQueue.io.enq.bits.mask := Mux1H( + vSew1H, + Seq( + maskForFilterInStage1, + FillInterleaved(2, maskForFilterInStage1(1, 0)), + // todo: handle first masked + FillInterleaved(4, maskForFilterInStage1(0)) ) + ) - // wait for the cross read to finish. - instructionCrossReadFinished := crossReadVRFOnly || readFinish - - /** indicate this is the last VRF write for a instruction for this lane. */ - val lastVRFWrite: Bool = lastGroupForLane < nextGroupCount - /** the next index to execute. */ - val nextExecuteIndex = Mux( - alwaysNextGroup && elementGroupEnd, - 0.U, - nextElementOffset(1, 0) - ) + // 先用一个伪装的执行单元 todo: 等执行单元重构需要替换 + if (true) { + val executionRecord: ExecutionUnitRecord = RegInit(0.U.asTypeOf(new ExecutionUnitRecord(parameter)(isLastSlot))) + + val executeIndex1H: UInt = UIntToOH(executionRecord.executeIndex) + + // state register + val sSendExecuteRequest = RegInit(true.B) + val wExecuteResult = RegInit(true.B) + val executeRequestStateValid: Bool = !sSendExecuteRequest + s2ExecuteOver := sSendExecuteRequest && wExecuteResult + + val source1Select: UInt = Mux(decodeResult(Decoder.vtype), readResult0, record.laneRequest.readFromScalar) + // init register when s2Fire + when(s2Fire) { + executionRecord.crossReadVS2 := decodeResult(Decoder.crossRead) && !decodeResult(Decoder.vwmacc) + executionRecord.bordersForMaskLogic := + (groupCounterInStage1 === record.lastGroupForInstruction && record.isLastLaneForMaskLogic) + executionRecord.mask := maskInStage1 + executionRecord.source := VecInit(Seq(source1Select, readResult1, readResult2)) + executionRecord.crossReadSource.foreach(_ := crossReadMSBIn ## crossReadLSBIn) + executionRecord.groupCounter := groupCounterInStage1 + sSendExecuteRequest := decodeResult(Decoder.dontNeedExecuteInLane) + wExecuteResult := decodeResult(Decoder.dontNeedExecuteInLane) + ffoSuccessImStage2.foreach(_ := false.B) + } /** the byte-level mask of current execution. * sew match: @@ -1591,8 +1033,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val byteMaskForExecution = Mux1H( vSew1H(2, 0), Seq( - UIntToOH(record.executeIndex), - record.executeIndex(1) ## record.executeIndex(1) ## !record.executeIndex(1) ## !record.executeIndex(1), + executeIndex1H, + executionRecord.executeIndex(1) ## executionRecord.executeIndex(1) ## + !executionRecord.executeIndex(1) ## !executionRecord.executeIndex(1), 15.U(4.W) ) ) @@ -1602,7 +1045,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ def CollapseOperand(data: UInt, enable: Bool = true.B, sign: Bool = false.B): UInt = { val dataMasked: UInt = data & bitMaskForExecution - val select: UInt = Mux(enable, vSew1H(2, 0), 4.U(3.W)) + val select: UInt = Mux(enable, vSew1H(2, 0), 4.U(3.W)) // when sew = 0 val collapse0 = Seq.tabulate(4)(i => dataMasked(8 * i + 7, 8 * i)).reduce(_ | _) // when sew = 1 @@ -1620,8 +1063,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // 有2 * sew 的操作数需要折叠 def CollapseDoubleOperand(sign: Bool = false.B): UInt = { val doubleBitEnable = FillInterleaved(16, byteMaskForExecution) - val doubleDataMasked: UInt = (crossReadMSBIn ## crossReadLSBIn) & doubleBitEnable - val select: UInt = vSew1H(1, 0) + val doubleDataMasked: UInt = executionRecord.crossReadSource.get & doubleBitEnable + val select: UInt = vSew1H(1, 0) // when sew = 0 val collapse0 = Seq.tabulate(4)(i => doubleDataMasked(16 * i + 15, 16 * i)).reduce(_ | _) // when sew = 1 @@ -1635,36 +1078,34 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) } - /** for reduce type, use the result of reduce as input of src1. - * for [[Decoder.popCount]], src1 is used for mask. + /** collapse the dual SEW size operand for cross read. + * it can be vd or src2. */ - val src1IsReduceResult: Bool = reduceType && !record.laneRequest.decodeResult(Decoder.popCount) + val doubleCollapse = Option.when(isLastSlot)(CollapseDoubleOperand(!decodeResult(Decoder.unsigned1))) /** src1 for the execution * src1 has three types: V, I, X. * only V type need to use [[CollapseOperand]] */ val finalSource1 = CollapseOperand( - // for reduce type, use the result of reduce as input of src1. - // if not use result from VRF - Mux(src1IsReduceResult, reduceResult(index), source1(index)), - decodeResult(Decoder.vtype) && !src1IsReduceResult, + // A will be updated every time it is executed, so you can only choose here + Mux(decodeResult(Decoder.red) && !decodeResult(Decoder.maskLogic), reduceResult, executionRecord.source.head), + decodeResult(Decoder.vtype) && (!decodeResult(Decoder.red) || decodeResult(Decoder.maskLogic)), !decodeResult(Decoder.unsigned0) ) - /** collapse the dual SEW size operand for cross read. - * it can be vd or src2. - */ - val doubleCollapse = CollapseDoubleOperand(!decodeResult(Decoder.unsigned1)) - /** src2 for the execution, * need to take care of cross read. */ - val finalSource2 = Mux( - crossReadVS2, - doubleCollapse, - CollapseOperand(source2(index), true.B, !decodeResult(Decoder.unsigned1)) - ) + val finalSource2 = if (isLastSlot) { + Mux( + executionRecord.crossReadVS2, + doubleCollapse.get, + CollapseOperand(executionRecord.source(1), true.B, !decodeResult(Decoder.unsigned1)) + ) + } else { + CollapseOperand(executionRecord.source(1), true.B, !decodeResult(Decoder.unsigned1)) + } /** source3 有两种:adc & ma, c等处理mask的时候再处理 * two types of source3: @@ -1673,7 +1114,33 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ * * this line only handle the first type. */ - val finalSource3 = Mux(decodeResult(Decoder.vwmacc), doubleCollapse, CollapseOperand(source3(index))) + val finalSource3: UInt = if (isLastSlot) { + Mux( + decodeResult(Decoder.vwmacc), + doubleCollapse.get, + CollapseOperand(executionRecord.source(2)) + ) + }else { + CollapseOperand(executionRecord.source(2)) + } + + val maskAsInput = Mux1H( + vSew1H(2, 0), + Seq( + (UIntToOH(executionRecord.executeIndex) & executionRecord.mask).orR, + Mux(executionRecord.executeIndex(1), executionRecord.mask(1), executionRecord.mask(0)), + executionRecord.mask(0) + ) + ) + + /** use mask to fix the case that `vl` is not in the multiple of [[parameter.datapathWidth]]. + * it will fill the LSB of mask to `0`, mask it to not execute those elements. + */ + val lastGroupMask = scanRightOr(UIntToOH(record.csr.vl(parameter.datapathWidthBits - 1, 0))) >> 1 + + /** if [[executionRecord.bordersForMaskLogic]], + * use [[lastGroupMask]] to mask the result otherwise use [[fullMask]]. */ + val maskCorrect = Mux(executionRecord.bordersForMaskLogic, lastGroupMask, fullMask) // logic request. val logicRequest = Wire(new MaskedLogicRequest(parameter.datapathWidth)) @@ -1683,15 +1150,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ logicRequest.src(3) := finalSource3 logicRequest.opcode := decodeResult(Decoder.uop) logicRequests(index) := maskAnd( - // TODO: remove !decodeResult(Decoder.other) - slotOccupied(index) && decodeResult(Decoder.logic) && !decodeResult(Decoder.other), + executeRequestStateValid && decodeResult(Decoder.logic), logicRequest ) // add request val adderRequest = Wire(new LaneAdderReq(parameter.datapathWidth)) adderRequest.src := VecInit(Seq(finalSource1, finalSource2)) - adderRequest.mask := maskAsInput + adderRequest.mask := maskAsInput && decodeResult(Decoder.maskSource) adderRequest.opcode := decodeResult(Decoder.uop) adderRequest.sign := !decodeResult(Decoder.unsigned1) adderRequest.reverse := decodeResult(Decoder.reverse) @@ -1700,7 +1166,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ adderRequest.vxrm := record.csr.vxrm adderRequest.vSew := record.csr.vSew adderRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.adder) && !decodeResult(Decoder.other), + executeRequestStateValid && decodeResult(Decoder.adder), adderRequest ) @@ -1709,12 +1175,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ shiftRequest.src := finalSource2 // 2 * sew has another 1 bit signal. shiftRequest.shifterSize := Mux1H( - Mux(needCrossRead, vSew1H(1, 0), vSew1H(2, 1)), + Mux(executionRecord.crossReadVS2, vSew1H(1, 0), vSew1H(2, 1)), Seq(false.B ## finalSource1(3), finalSource1(4, 3)) ) ## finalSource1(2, 0) shiftRequest.opcode := decodeResult(Decoder.uop) shiftRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.shift) && !decodeResult(Decoder.other), + executeRequestStateValid && decodeResult(Decoder.shift), shiftRequest ) shiftRequest.vxrm := record.csr.vxrm @@ -1727,7 +1193,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ mulRequest.vSew := record.csr.vSew mulRequest.vxrm := record.csr.vxrm multiplerRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.multiplier) && !decodeResult(Decoder.other), + executeRequestStateValid && decodeResult(Decoder.multiplier), mulRequest ) @@ -1736,280 +1202,455 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ divRequest.src := VecInit(Seq(finalSource1, finalSource2)) divRequest.rem := decodeResult(Decoder.uop)(0) divRequest.sign := !decodeResult(Decoder.unsigned0) - divRequest.index := record.executeIndex + divRequest.index := executionRecord.executeIndex dividerRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.divider) && !decodeResult(Decoder.other), + executeRequestStateValid && decodeResult(Decoder.divider), divRequest ) // other val otherRequest: OtherUnitReq = Wire(Output(new OtherUnitReq(parameter))) otherRequest.src := VecInit(Seq(finalSource1, finalSource2, finalSource3)) - otherRequest.popInit := reduceResult(index) + otherRequest.popInit := reduceResult otherRequest.opcode := decodeResult(Decoder.uop) otherRequest.imm := record.laneRequest.vs1 otherRequest.laneIndex := laneIndex - otherRequest.groupIndex := record.groupCounter - otherRequest.executeIndex := record.executeIndex + otherRequest.groupIndex := executionRecord.groupCounter + otherRequest.executeIndex := executionRecord.executeIndex otherRequest.sign := !decodeResult(Decoder.unsigned0) - // todo: vmv.v.* decode to mv instead of merge or delete mv otherRequest.mask := maskAsInput || !record.laneRequest.mask otherRequest.complete := record.ffoByOtherLanes || record.selfCompleted otherRequest.maskType := record.laneRequest.mask otherRequest.vSew := record.csr.vSew otherRequest.vxrm := record.csr.vxrm otherRequests(index) := maskAnd( - slotOccupied(index) && decodeResult(Decoder.other), + executeRequestStateValid && decodeResult(Decoder.other), otherRequest ) - // mask request - // TODO: remove Output - val maskRequest: LaneResponse = Wire(Output(new LaneResponse(parameter))) + instructionTypeVec(index) := VecInit( + Seq( + decodeResult(Decoder.logic), + decodeResult(Decoder.adder), + decodeResult(Decoder.shift), + decodeResult(Decoder.multiplier), + decodeResult(Decoder.divider), + decodeResult(Decoder.other) + ) + ).asUInt - /** mask type instructions don't need to communicate with scheduler every cycle - * only mask destination type instructions doesn't align with state machine in scheduler - * we use [[maskTypeDestinationWriteReady]] to distinguish this - * - * there are three types: - * - for each execution group, need request, e.g. find first one, and indexed - * - for each mask group, e.g. [[Decoder.maskDestination]] - * - only need to send once at last, [[Decoder.red]] and [[Decoder.popCount]] - */ - val canSendMaskRequest = needResponse && readFinish && record.state.sExecute - - /** in this slot, the [[maskRequest]] is valid. */ - val maskRequestValid = canSendMaskRequest && !record.state.sSendResponse - maskRequest.data := Mux( - // todo: decode - // TODO: remove !record.laneRequest.loadStore - record.laneRequest.decodeResult(Decoder.maskDestination) && !record.laneRequest.loadStore, - maskFormatResult, - Mux( - reduceType, - reduceResult(index), - Mux( - record.laneRequest.decodeResult(Decoder.gather) && !record.laneRequest.loadStore, - source1(index), - Mux( - record.laneRequest.decodeResult(Decoder.ffo) && !record.laneRequest.loadStore, - ffoIndexReg, - source2(index) - ) - ) + executeEnqueueValid(index) := maskAnd( + executeRequestStateValid, + instructionTypeVec(index) + ) + + /** select from VFU, send to [[executionResult]], [[Stage2crossWriteLSB]], [[Stage2crossWriteMSB]]. */ + val dataDequeue: UInt = Mux1H(instructionTypeVec(index), executeDequeueData) + + val executeEnqueueFireForSlot: Bool = (instructionTypeVec(index) & executeEnqueueFire).orR + + /** fire of [[dataDequeue]] */ + val executeDequeueFireForSlot: Bool = (instructionTypeVec(index) & executeDequeueFire).orR + + // mask reg for filtering + val maskForFilter = FillInterleaved(4, maskNotMaskedElement) | executionRecord.mask + // current one hot depends on execute index + val currentOHForExecuteGroup: UInt = UIntToOH(executionRecord.executeIndex) + // Remaining to be requested + val remainder: UInt = maskForFilter & (~scanRightOr(currentOHForExecuteGroup)).asUInt + // Finds the first unfiltered execution. + val nextIndex1H: UInt = ffo(remainder) + + // There are no more left. + val isLastRequestForThisGroup: Bool = + Mux1H(vSew1H, Seq(!remainder.orR, !remainder(1, 0).orR, true.B)) + + /** the next index to execute. + * @note Requests into this disguised execution unit are not executed on the spot + * */ + val nextExecuteIndex: UInt = Mux1H( + vSew1H(1, 0), + Seq( + OHToUInt(nextIndex1H), + // Mux(remainder(0), 0.U, 2.U) + !remainder(0) ## false.B + ) + ) + + // next execute index if data group change + val nextExecuteIndexForNextGroup = Mux1H( + vSew1H(1, 0), + Seq( + OHToUInt(ffo(maskForFilterInStage1)), + !maskForFilterInStage1(0) ## false.B, ) ) - maskRequest.toLSU := record.laneRequest.loadStore - maskRequest.instructionIndex := record.laneRequest.instructionIndex - maskRequest.ffoSuccess := record.selfCompleted - maskRequests(index) := maskAnd(slotOccupied(index) && maskRequestValid, maskRequest) - maskRequestValids(index) := maskRequestValid - - // wResponseFeedback state machine control - when( - laneResponseFeedback.valid && laneResponseFeedback.bits.instructionIndex === record.laneRequest.instructionIndex - ) { - record.state.wResponseFeedback := true.B + + // update execute index + when(executeEnqueueFireForSlot || s2Fire) { + executionRecord.executeIndex := Mux(s2Fire, nextExecuteIndexForNextGroup, nextExecuteIndex) } - // TODO: why no ready? - when(maskRequestValid) { - record.state.sSendResponse := true.B + when(executeEnqueueFireForSlot && isLastRequestForThisGroup) { + sSendExecuteRequest := true.B } - // assign type of this slot. - instructionTypeVec(index) := record.laneRequest.instType - // only long latency(div) mask is located at the execution unit - // others are located at the VRF write - executeEnqueueValid(index) := maskAnd( - readFinish && !record.state.sExecute && !maskedLongLatency, - instructionTypeVec(index) + // execute response finish + val responseFinish: Bool = Mux( + decodeResult(Decoder.divider), + executeDequeueFireForSlot && sSendExecuteRequest, + executeEnqueueFireForSlot && isLastRequestForThisGroup ) - // change element in execution unit: - // element is issued, or don't need execution(in long latency pipe), need next element. - when((instructionTypeVec(index) & executeEnqueueFire).orR || maskedLongLatency) { - when(enqGroupEnd) { - record.state.sExecute := true.B - }.otherwise { - // TODO: move out from enqGroupEnd - record.executeIndex := nextExecuteIndex - } - // TODO: record.executeIndex := nextExecuteIndex + when(responseFinish) { + wExecuteResult := true.B } - // TODO: this is a dynamic shift logic, but if we switch to parallel execution unit, we don't need it anymore. - val executeResult = (dataDequeue << dataOffset).asUInt(parameter.datapathWidth - 1, 0) - // TODO: remove it. - val writeByteEnable = Mux1H( + /** the index to write to VRF in [[parameter.dataPathByteWidth]]. + * for long latency pipe, the index will follow the pipeline. + */ + val writeIndex = Mux( + record.laneRequest.decodeResult(Decoder.divider), + divWriteIndex, + executionRecord.executeIndex + ) + + val writeIndex1H = UIntToOH(writeIndex) + + /** VRF byte level mask */ + val writeMaskInByte = Mux1H( vSew1H(2, 0), Seq( - UIntToOH(writeIndex), + writeIndex1H, writeIndex(1) ## writeIndex(1) ## !writeIndex(1) ## !writeIndex(1), - 15.U(4.W) + "b1111".U(4.W) ) ) + + /** VRF bit level mask */ + val writeMaskInBit: UInt = FillInterleaved(8, writeMaskInByte) + + /** output of execution unit need to align to VRF in bit level(used in dynamic shift) + * TODO: fix me + */ + val dataOffset: UInt = writeIndex ## 0.U(3.W) + + // TODO: this is a dynamic shift logic, but if we switch to parallel execution unit, we don't need it anymore. + val executeResult = (dataDequeue << dataOffset).asUInt(parameter.datapathWidth - 1, 0) + // execute 1,2,4 times based on SEW, only write VRF when 32 bits is ready. - val resultUpdate: UInt = (executeResult & writeMaskInBit) | (executionResult(index) & (~writeMaskInBit).asUInt) - // VFU is finished, send to VRF. - // TODO: `longLatency && !divBusy && groupEnd && record.state.sExecute` ???????? - when(dataDequeueFire || (longLatency && !divBusy && elementGroupEnd && record.state.sExecute)) { - // this element group is finished, including the long latency pipe. - when(elementGroupEnd && !(divBusy && longLatency)) { - record.state.wExecuteRes := true.B - } + val resultUpdate: UInt = (executeResult & writeMaskInBit) | (executionResult & (~writeMaskInBit).asUInt) + // update execute result + when(executeDequeueFireForSlot) { // update the [[executionResult]] - executionResult(index) := resultUpdate + executionResult := resultUpdate // the find first one instruction is finished in this lane - record.selfCompleted := otherResponse.ffoSuccess + ffoSuccessImStage2.foreach(_ := otherResponse.ffoSuccess) when(otherResponse.ffoSuccess && !record.selfCompleted) { - ffoIndexReg := record.groupCounter ## Mux1H( + ffoIndexReg := record.lastGroupForInstruction ## Mux1H( vSew1H, Seq( - record.executeIndex ## otherResponse.data(2, 0), - record.executeIndex(1) ## otherResponse.data(3, 0), + executionRecord.executeIndex ## otherResponse.data(2, 0), + executionRecord.executeIndex(1) ## otherResponse.data(3, 0), otherResponse.data(4, 0) ) ) } - // update [[vrfWriteMask]], for long latency pipe, only update when response. - // if reduce type, - when(!masked || (longLatency && divWrite)) { - record.vrfWriteMask := record.vrfWriteMask | writeByteEnable - // TODO: move out: - // when(!masked && reduceType) { - // reduceResult(index) := dataDequeue - // } - when(reduceType) { - reduceResult(index) := dataDequeue + // update cross-lane write data + /** sew: + * 0: + * executeIndex: + * 0: mask = 0011, head + * 1: mask = 1100, head + * 2: mask = 0011, tail + * 3: mask = 1100, tail + * 1: + * executeIndex: + * 0: mask = 1111, head + * 2: mask = 1111, tail + * + * 2: not valid in SEW = 2 + */ + if (isLastSlot) { + when(executionRecord.executeIndex(1)) { + Stage2crossWriteMSB.foreach { crossWriteData => + // update tail + crossWriteData := + Mux( + record.csr.vSew(0), + dataDequeue(parameter.datapathWidth - 1, parameter.halfDatapathWidth), + Mux( + executionRecord.executeIndex(0), + dataDequeue(parameter.halfDatapathWidth - 1, 0), + crossWriteData(parameter.datapathWidth - 1, parameter.halfDatapathWidth) + ) + ) ## Mux( + !executionRecord.executeIndex(0) || record.csr.vSew(0), + dataDequeue(parameter.halfDatapathWidth - 1, 0), + crossWriteData(parameter.halfDatapathWidth - 1, 0) + ) + } + }.otherwise { + Stage2crossWriteLSB.foreach { crossWriteData => + crossWriteData := + Mux( + record.csr.vSew(0), + dataDequeue(parameter.datapathWidth - 1, parameter.halfDatapathWidth), + Mux( + executionRecord.executeIndex(0), + dataDequeue(parameter.halfDatapathWidth - 1, 0), + crossWriteData(parameter.datapathWidth - 1, parameter.halfDatapathWidth) + ) + ) ## Mux( + !executionRecord.executeIndex(0) || record.csr.vSew(0), + dataDequeue(parameter.halfDatapathWidth - 1, 0), + crossWriteData(parameter.halfDatapathWidth - 1, 0) + ) + } } } } - /** update value for [[maskFormatResultUpdate]], - * it comes from ALU. - */ - val elementMaskFormatResult: UInt = Mux(adderMaskResp && !masked, current1H, 0.U) + // update mask result + if (isLastSlot) { + val current1HInGroup = Mux1H( + vSew1H(2, 0), + Seq( + // 32bit, 4 bit per data group, it will had 8 data groups -> executeIndex1H << 4 * groupCounter(2, 0) + executeIndex1H << (executionRecord.groupCounter(2, 0) ## 0.U(2.W)), + // 2 bit per data group, it will had 16 data groups -> executeIndex1H << 2 * groupCounter(3, 0) + (executionRecord.executeIndex(1) ## !executionRecord.executeIndex(1)) << + (executionRecord.groupCounter(3, 0) ## false.B), + // 1 bit per data group, it will had 32 data groups -> executeIndex1H << 1 * groupCounter(4, 0) + 1.U << executionRecord.groupCounter(4, 0) + ) + ).asUInt - /** update value for [[maskFormatResult]] */ - val maskFormatResultUpdate: UInt = maskFormatResult | elementMaskFormatResult + /** update value for [[maskFormatResultUpdate]], + * it comes from ALU. + */ + val elementMaskFormatResult: UInt = Mux(adderMaskResp, current1HInGroup, 0.U) - // update [[maskFormatResult]] - when(dataDequeueFire || maskRequestValid) { - maskFormatResult := Mux(maskRequestValid, 0.U, maskFormatResultUpdate) + /** update value for [[maskFormatResultForGroup]] */ + val maskFormatResultUpdate: UInt = maskFormatResultForGroup.get | elementMaskFormatResult + + // update `maskFormatResultForGroup` + when(executeDequeueFireForSlot || updateMaskResult.get) { + maskFormatResultForGroup.foreach(_ := Mux(executeDequeueFireForSlot, maskFormatResultUpdate, 0.U)) + } + // masked element don't update 'reduceResult' + val updateReduceResult = (maskNotMaskedElement || maskAsInput) && executeDequeueFireForSlot + // update `reduceResult` + when( updateReduceResult || updateMaskResult.get) { + reduceResult := Mux(updateReduceResult && decodeResult(Decoder.red), dataDequeue, 0.U) + } } + } - /** if the find first one instruction is finished by other lanes, - * what should be written to VRF. - */ - val ffoWriteVRFByOtherLanes = Mux(record.laneRequest.mask, (~source1(index)).asUInt & source3(index), 0.U) + // --- stage 2 end & stage 3 start --- + // Since top has only one mask processing unit, + // all instructions that interact with top are placed in a single slot + + val s3Valid = valid2 && s2ExecuteOver + val s3Ready = Wire(Bool()) + val s3Fire = s3Valid && s3Ready + // Used to update valid3 without writing vrf + val s3DequeueFire: Option[Bool] = Option.when(isLastSlot)(Wire(Bool())) + val valid3: Option[Bool] = Option.when(isLastSlot)(RegInit(0.U(false.B))) + // use for cross-lane write + val groupCounterInStage3: Option[UInt] = Option.when(isLastSlot)(RegInit(0.U(7.W))) + val maskInStage3: Option[UInt] = Option.when(isLastSlot)(RegInit(0.U(4.W))) + val executionResultInStage3 = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + val pipeDataInStage3 = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + // result for vfirst type instruction + val ffoIndexRegInStage3 = Option.when(isLastSlot)(RegInit(0.U(log2Ceil(parameter.vLen / 8).W))) + // pipe vd for ff0 + val pipeVDInStage3: Option[UInt] = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + updateMaskResult.foreach(_ := s3Fire && !sSendResponseInStage2.get) + // cross write result + val Stage3crossWriteLSB = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + val Stage3crossWriteMSB = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + + // cross write state + /** schedule cross lane write LSB */ + val sCrossWriteLSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) + + /** schedule cross lane write MSB */ + val sCrossWriteMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) + + // data for response to scheduler + val schedulerResponseData: Option[UInt] = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + + // state for response to scheduler + /** schedule send [[LaneResponse]] to scheduler */ + val sSendResponse: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) + + /** wait scheduler send [[LaneResponseFeedback]] */ + val wResponseFeedback: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) + + val vrfWriteVundle: VRFWriteRequest = new VRFWriteRequest( + parameter.vrfParam.regNumBits, + parameter.vrfOffsetBits, + parameter.instructionIndexBits, + parameter.datapathWidth + ) + + val vrfWriteQueue: Queue[VRFWriteRequest] = + Module(new Queue(vrfWriteVundle, entries = 1, pipe = false, flow = false)) + valid3.foreach {data => when(s3DequeueFire.get ^ s3Fire) { data := s3Fire }} + + /** Write queue ready or not need to write. */ + val vrfWriteReady: Bool = vrfWriteQueue.io.enq.ready || decodeResult(Decoder.sWrite) + + if (isLastSlot) { + // VRF cross write + /** execute in ALU, try to send cross lane write LSB data to ring */ + val tryCrossWriteSendLSB = valid3.get && !sCrossWriteLSB.get + + /** execute in ALU, try to send cross lane write MSB data to ring */ + val tryCrossWriteSendMSB = valid3.get && !sCrossWriteMSB.get + crossLaneWrite.bits.sinkIndex := laneIndex(parameter.laneNumberBits - 2, 0) ## (!tryCrossWriteSendLSB) + crossLaneWrite.bits.sourceIndex := laneIndex + crossLaneWrite.bits.isTail := laneIndex(parameter.laneNumberBits - 1) + crossLaneWrite.bits.instructionIndex := record.laneRequest.instructionIndex + crossLaneWrite.bits.counter := groupCounterInStage3.get + crossLaneWrite.bits.data := Mux(tryCrossWriteSendLSB, Stage3crossWriteLSB.get, Stage3crossWriteMSB.get) + crossLaneWrite.bits.mask := Mux(tryCrossWriteSendLSB, maskInStage3.get(1, 0), maskInStage3.get(3, 2)) + crossLaneWrite.valid := tryCrossWriteSendLSB || tryCrossWriteSendMSB - /** no need to wait for scheduler */ - val noNeedWaitScheduler: Bool = !canSendMaskRequest || decodeResult(Decoder.scheduler) || schedulerFinish + when(crossLaneWriteReady && crossLaneWrite.valid) { + sCrossWriteLSB.foreach(_ := true.B) + when(sCrossWriteLSB.get) { + sCrossWriteMSB.foreach(_ := true.B) + } + } + // scheduler synchronization + val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _) - vrfWriteArbiter(index).valid := - // state machine says: execute is finished(or don't need to execute) and need to write VRF(but not write yet) - record.state.wExecuteRes && !record.state.sWrite && - // read VRF but not execute - readFinish && - // no need to wait for scheduler - (!canSendMaskRequest || decodeResult(Decoder.scheduler) || schedulerFinish) && - // this slot is active - slotActive(index) + // mask request + laneResponse.valid := valid3.get && !sSendResponse.get + laneResponse.bits.data := Mux(decodeResult(Decoder.ffo), ffoIndexRegInStage3.get, pipeDataInStage3.get) + laneResponse.bits.toLSU := record.laneRequest.loadStore + laneResponse.bits.instructionIndex := record.laneRequest.instructionIndex + laneResponse.bits.ffoSuccess := record.selfCompleted + + sSendResponse.foreach(state => when(laneResponse.valid) { state := true.B}) + wResponseFeedback.foreach(state => when(laneResponseFeedback.valid) { state := true.B}) + + when(laneResponseFeedback.valid && slotOccupied(index)) { + when(laneResponseFeedback.bits.complete) { record.ffoByOtherLanes := true.B } + assert(laneResponseFeedback.bits.instructionIndex === record.laneRequest.instructionIndex) + } + + // enqueue write for last slot + vrfWriteQueue.io.enq.valid := valid3.get && schedulerFinish && !decodeResult(Decoder.sWrite) // UInt(5.W) + UInt(3.W), use `+` here - vrfWriteArbiter(index).bits.vd := record.laneRequest.vd + record.groupCounter( + vrfWriteQueue.io.enq.bits.vd := record.laneRequest.vd + groupCounterInStage3.get( parameter.groupNumberBits - 1, parameter.vrfOffsetBits ) - vrfWriteArbiter(index).bits.offset := record.groupCounter + vrfWriteQueue.io.enq.bits.offset := groupCounterInStage3.get - vrfWriteArbiter(index).bits.data := Mux( - record.ffoByOtherLanes, - ffoWriteVRFByOtherLanes, - Mux(nr, source2(index), executionResult(index)) + /** what will write into vrf when ffo type instruction finished by other lanes */ + val completeWrite: UInt = Mux(record.laneRequest.mask, (~pipeDataInStage3.get).asUInt & pipeVDInStage3.get, 0.U) + vrfWriteQueue.io.enq.bits.data := Mux( + decodeResult(Decoder.nr), + pipeDataInStage3.get, + Mux( + record.ffoByOtherLanes, + completeWrite, + executionResultInStage3.get + ) ) + vrfWriteQueue.io.enq.bits.last := DontCare + vrfWriteQueue.io.enq.bits.instructionIndex := record.laneRequest.instructionIndex + vrfWriteQueue.io.enq.bits.mask := maskInStage3.get + + // Handshake + /** Cross-lane writing is over */ + val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ sCrossWriteMSB).reduce(_ && _) + + s3Ready := !valid3.get || (CrossLaneWriteOver && schedulerFinish && vrfWriteReady) + s3DequeueFire.foreach(_ := valid3.get && CrossLaneWriteOver && schedulerFinish && vrfWriteReady) + + //Update the registers of stage3 + when(s3Fire) { + groupCounterInStage3.foreach(_ := executionQueue.io.deq.bits.groupCounter) + maskInStage3.foreach(_ := executionQueue.io.deq.bits.mask) + executionResultInStage3.foreach(_ := executionResult) + // todo: update maskFormatResult & reduceResult + pipeDataInStage3.foreach(_ := Mux( + decodeResult(Decoder.maskDestination), + maskFormatResultForGroup.get, + Mux( + decodeResult(Decoder.red), + reduceResult, + executionQueue.io.deq.bits.pipeData.get + ) + )) + ffoIndexRegInStage3.foreach(_ := ffoIndexReg) + pipeVDInStage3.foreach(_ := executionQueue.io.deq.bits.pipeVD.get) + // cross write data + Stage3crossWriteLSB.foreach(_ := Stage2crossWriteLSB.get) + Stage3crossWriteMSB.foreach(_ := Stage2crossWriteMSB.get) + // init state + (sCrossWriteLSB ++ sCrossWriteMSB).foreach(_ := !decodeResult(Decoder.crossWrite)) + // todo: save mask destination result if needSendResponse at stage 2? + (sSendResponse ++ wResponseFeedback).foreach( + _ := decodeResult(Decoder.scheduler) || sSendResponseInStage2.get + ) - // todo: 是否条件有多余 - vrfWriteArbiter(index).bits.last := instructionExecuteFinished(index) || lastVRFWrite - - vrfWriteArbiter(index).bits.instructionIndex := record.laneRequest.instructionIndex - - vrfWriteArbiter(index).bits.mask := record.vrfWriteMask | Fill(4, nr) + // save scheduler data, todo: select result when update 'executionResultInStage3' + schedulerResponseData.foreach { data => + data := Mux( + record.laneRequest.decodeResult(Decoder.maskDestination), + maskFormatResultForGroup.get, + executionResultInStage3.get + ) + } - when(vrfWriteFire(index)) { - record.state.sWrite := true.B + ffoSuccessImStage2.foreach(record.selfCompleted := _) + // This group found means the next group ended early + record.ffoByOtherLanes := record.ffoByOtherLanes || record.selfCompleted } - // TODO: don't last connect, use WireDefault - instructionFinishedVec(index) := 0.U(parameter.chainingSize.W) - - /** state machine to check the element group is finished. */ - val groupFinishCheckedByStateMachine = Mux( - crossReadVRFOnly, - readFinish, - (readFinish && executeFinish && sendCrossReadResultFinish && crossWriteFinish) && - (!needResponse || schedulerFinish) && - // can update mask or don't need to update mask - (maskRequestFireOH(index) || !maskNeedUpdate) + } else { + // Normal will be one level less + vrfWriteQueue.io.enq.valid := s3Fire + + // UInt(5.W) + UInt(3.W), use `+` here + vrfWriteQueue.io.enq.bits.vd := record.laneRequest.vd + executionQueue.io.deq.bits.groupCounter( + parameter.groupNumberBits - 1, + parameter.vrfOffsetBits ) - // finish group or vl is too small to use this lane - when(groupFinishCheckedByStateMachine || record.instructionFinished) { - // instruction is finished - when( - // the instruction is finished in this slot - (instructionExecuteFinished(index) && - // cross lane read is finished - (!needCrossRead || instructionCrossReadFinished) && - // don't need scheduler feedback - noFeedBack) || - // vl is too small to use this lane - record.instructionFinished - ) { - slotOccupied(index) := false.B - maskFormatResult := 0.U - when(slotOccupied(index)) { - instructionFinishedVec(index) := UIntToOH( - record.laneRequest.instructionIndex(parameter.instructionIndexBits - 2, 0) - ) - } - } - // instruction is not finished, thus element group is changed. - .otherwise { - record.state := record.laneRequest.initState - record.groupCounter := nextGroupCount - record.executeIndex := nextExecuteIndex - record.vrfWriteMask := 0.U - when(maskRequestFireOH(index)) { - record.mask.valid := true.B - record.mask.bits := maskInput - record.maskGroupedOrR := maskGroupedOrR - } - } - } + vrfWriteQueue.io.enq.bits.offset := executionQueue.io.deq.bits.groupCounter - // - when( - laneResponseFeedback.bits.complete && laneResponseFeedback.valid && - laneResponseFeedback.bits.instructionIndex === record.laneRequest.instructionIndex - ) { - // previous elements found first one - record.ffoByOtherLanes := true.B - - // indexed load/store read offset. - // most of mask unit don't need execute. - when(decodeResult(Decoder.dontNeedExecuteInLane)) { - slotOccupied(index) := false.B - } - } - // sending mask group change request to scheduler. - slotMaskRequestVec(index).valid := maskNeedUpdate - slotMaskRequestVec(index).bits := nextGroupCountMSB + vrfWriteQueue.io.enq.bits.data := executionResult + vrfWriteQueue.io.enq.bits.last := DontCare + vrfWriteQueue.io.enq.bits.instructionIndex := record.laneRequest.instructionIndex + vrfWriteQueue.io.enq.bits.mask := executionQueue.io.deq.bits.mask + + // Handshake + s3Ready := vrfWriteQueue.io.enq.ready } + s2Ready := !valid2 || (s2ExecuteOver && s3Ready && executionQueue.io.enq.ready) + when(s2Fire ^ s3Fire) {valid2 := s2Fire} + // s2 enqueue valid & s2 all ready except executionQueue + executionQueue.io.enq.valid := s2Valid && ((s2ExecuteOver && s3Ready) || !valid2) + executionQueue.io.deq.ready := s3Ready && s2ExecuteOver + + // --- stage 3 end & stage 4 start --- + // vrfWriteQueue try to write vrf + vrfWriteArbiter(index).valid := vrfWriteQueue.io.deq.valid + vrfWriteArbiter(index).bits := vrfWriteQueue.io.deq.bits + vrfWriteQueue.io.deq.ready := vrfWriteFire(index) + + pipeClear := !(Seq(valid0, valid1, valid2, vrfWriteQueue.io.deq.valid) ++ valid3).reduce(_ || _) } // Read Ring @@ -2022,7 +1663,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ readBusPort.enq.bits.sinkIndex === laneIndex && // because the ring may send unordered transactions, we need the check the counter on the ring. // TODO: add one depth escape queue to latch the case that transaction on the ring is not the current groupCounter. - readBusPort.enq.bits.counter === slotControl.head.groupCounter + readBusPort.enq.bits.counter === readBusDequeueGroup } // when `readBusDequeueMatch`, local lane must be ready. readBusDequeue.valid := readBusDequeueMatch && readBusPort.enq.valid @@ -2067,7 +1708,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ crossLaneWriteQueue.io.enq.bits.vd := slotControl.head.laneRequest.vd + writeBusPort.enq.bits.counter(3, 1) crossLaneWriteQueue.io.enq.bits.offset := writeBusPort.enq.bits.counter ## writeBusPort.enq.bits.isTail crossLaneWriteQueue.io.enq.bits.data := writeBusPort.enq.bits.data - crossLaneWriteQueue.io.enq.bits.last := instructionExecuteFinished.head && writeBusPort.enq.bits.isTail + crossLaneWriteQueue.io.enq.bits.last := DontCare crossLaneWriteQueue.io.enq.bits.instructionIndex := slotControl.head.laneRequest.instructionIndex crossLaneWriteQueue.io.enq.bits.mask := FillInterleaved(2, writeBusPort.enq.bits.mask) @@ -2112,10 +1753,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ .reduce(_ | _) .asTypeOf(new LaneDivRequest(parameter.datapathWidth)) otherUnit.req := VecInit(otherRequests.map(_.asUInt)).reduce(_ | _).asTypeOf(Output(new OtherUnitReq(parameter))) - laneResponse.bits := VecInit(maskRequests.map(_.asUInt)) - .reduce(_ | _) - .asTypeOf(Output(new LaneResponse(parameter))) - laneResponse.valid := maskRequestValids.asUInt.orR // 执行单元的其他连接 otherResponse := otherUnit.resp lastDivWriteIndexWire := div.index @@ -2205,8 +1842,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ entranceControl.csr := csrInterface entranceControl.laneRequest := laneRequest.bits - // TODO: fix me with decode - entranceControl.state := laneRequest.bits.initState // TODO: in scalar core, raise illegal instruction exception when vstart is nonzero. // see [[https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#37-vector-start-index-csr-vstart]] // "Such implementations are permitted to raise an illegal instruction exception @@ -2229,21 +1864,67 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ entranceControl.maskGroupedOrR := maskGroupedOrR // mask used for VRF write in this group. entranceControl.vrfWriteMask := 0.U - // todo: vStart(2,0) > lane index - // which group to start. - // TODO: set to 0. - entranceControl.groupCounter := (csrInterface.vStart >> 3).asUInt - /** slot inside [[Lane]] is ready to shift. - * don't shift when feedback. + // calculate last group + val lastElementIndex: UInt = (csrInterface.vl - 1.U)(parameter.vlMaxBits - 2, 0) + val requestVSew1H: UInt = UIntToOH(csrInterface.vSew) + + /** For an instruction, the last group is not executed by all lanes, + * here is the last group of the instruction + * xxxxx xxx xx -> vsew = 0 + * xxxxxx xxx x -> vsew = 1 + * xxxxxxx xxx -> vsew = 2 */ - val shiftReady = slotCanShift.asUInt.andR && !laneResponseFeedback.valid + val lastGroupForInstruction: UInt = Mux1H( + requestVSew1H(2, 0), + Seq( + lastElementIndex(parameter.vlMaxBits - 2, parameter.laneNumberBits + 2), + lastElementIndex(parameter.vlMaxBits - 2, parameter.laneNumberBits + 1), + lastElementIndex(parameter.vlMaxBits - 2, parameter.laneNumberBits) + ) + ) - // handshake - laneRequest.ready := !slotOccupied.head && vrf.instructionWriteReport.ready && shiftReady + /** Which lane the last element is in. */ + val lastLaneIndex: UInt = Mux1H( + requestVSew1H(2, 0), + Seq( + lastElementIndex(parameter.laneNumberBits + 2 - 1, 2), + lastElementIndex(parameter.laneNumberBits + 1 - 1, 1), + lastElementIndex(parameter.laneNumberBits - 1, 0) + ) + ) - // Slot shift logic - when( + /** The relative position of the last lane determines the processing of the last group. */ + val lanePositionLargerThanEndLane: Bool = laneIndex > lastLaneIndex + val isEndLane: Bool = laneIndex === lastLaneIndex + val lastGroupForLane: UInt = lastGroupForInstruction - lanePositionLargerThanEndLane + + // last group for mask logic type + /** xxx xxx xxxxx + * head body tail + */ + val vlTail: UInt = csrInterface.vl(parameter.datapathWidthBits - 1, 0) + val vlBody: UInt = + csrInterface.vl(parameter.datapathWidthBits + parameter.laneNumberBits - 1, parameter.datapathWidthBits) + val vlHead: UInt = csrInterface.vl(parameter.vlMaxBits - 1, parameter.datapathWidthBits + parameter.laneNumberBits) + val lastGroupMask = scanRightOr(UIntToOH(vlTail)) >> 1 + val dataPathMisaligned = vlTail.orR + val maskeDataGroup = (vlHead ## vlBody) - !dataPathMisaligned + val lastLaneIndexForMaskLogic: UInt = maskeDataGroup(parameter.laneNumberBits - 1, 0) + val isLastLaneForMaskLogic: Bool = lastLaneIndexForMaskLogic === laneIndex + val lastGroupCountForMaskLogic: UInt = (maskeDataGroup >> parameter.laneNumberBits).asUInt + + entranceControl.lastGroupForInstruction := Mux( + laneRequest.bits.decodeResult(Decoder.maskLogic), + lastGroupCountForMaskLogic, + lastGroupForLane + ) + + entranceControl.isLastLaneForMaskLogic := + isLastLaneForMaskLogic && dataPathMisaligned && laneRequest.bits.decodeResult(Decoder.maskLogic) + + // slot needs to be moved + slotShiftValid := // the first slot is not occupied !slotOccupied.head && ( @@ -2251,19 +1932,23 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ slotOccupied.asUInt.orR || // new instruction enqueue laneRequest.valid - ) && - // slots is ready to shift - shiftReady - ) { + ) + + /** slot inside [[Lane]] is ready to shift. + * don't shift when feedback. + */ + val shiftReady: Bool = slotCanShift.asUInt.andR && !laneResponseFeedback.valid + + // handshake + laneRequest.ready := !slotOccupied.head && vrf.instructionWriteReport.ready && shiftReady + + // Slot shift logic + when(slotShiftValid && shiftReady) { slotOccupied := VecInit(slotOccupied.tail :+ laneRequest.valid) - source1 := VecInit(source1.tail :+ laneRequest.bits.readFromScalar) slotControl := VecInit(slotControl.tail :+ entranceControl) - executionResult := VecInit(executionResult.tail :+ 0.U(parameter.datapathWidth.W)) - reduceResult := VecInit(reduceResult.tail :+ 0.U(parameter.datapathWidth.W)) - source2 := VecInit(source2.tail :+ 0.U(parameter.datapathWidth.W)) - source3 := VecInit(source3.tail :+ 0.U(parameter.datapathWidth.W)) - crossWriteMaskLSBHalf := 0.U - crossWriteMaskMSBHalf := 0.U + maskGroupCountVec := VecInit(maskGroupCountVec.tail :+ 0.U(parameter.maskGroupSizeBits.W)) + maskIndexVec := VecInit(maskIndexVec.tail :+ 0.U(log2Ceil(parameter.maskGroupWidth).W)) + pipeFinishVec := VecInit(pipeFinishVec.tail :+ false.B) } vrf.flush := maskUnitFlushVrf @@ -2273,7 +1958,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.instructionWriteReport.bits.offset := 0.U //todo vrf.instructionWriteReport.bits.vdOffset := 0.U vrf.instructionWriteReport.bits.vd.bits := laneRequest.bits.vd - vrf.instructionWriteReport.bits.vd.valid := !laneRequest.bits.initState.sWrite || (laneRequest.bits.loadStore && !laneRequest.bits.store) + vrf.instructionWriteReport.bits.vd.valid := !laneRequest.bits.decodeResult(Decoder.sWrite) || (laneRequest.bits.loadStore && !laneRequest.bits.store) vrf.instructionWriteReport.bits.vs2 := laneRequest.bits.vs2 vrf.instructionWriteReport.bits.vs1.bits := laneRequest.bits.vs1 vrf.instructionWriteReport.bits.vs1.valid := laneRequest.bits.decodeResult(Decoder.vtype) @@ -2290,7 +1975,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.instructionWriteReport.bits.widen := laneRequest.bits.decodeResult(Decoder.crossWrite) vrf.instructionWriteReport.bits.stFinish := false.B vrf.instructionWriteReport.bits.mul := Mux(csrInterface.vlmul(2), 0.U, csrInterface.vlmul(1, 0)) - vrf.lsuLastReport := lsuLastReport + // clear record by instructionFinished + vrf.lsuLastReport := lsuLastReport | instructionFinished vrf.lsuWriteBufferClear := lsuVRFWriteBufferClear instructionFinished := instructionFinishedVec.reduce(_ | _) } diff --git a/v/src/RegFile.scala b/v/src/RegFile.scala index 69e7dd1f2..e070cdf2b 100644 --- a/v/src/RegFile.scala +++ b/v/src/RegFile.scala @@ -5,6 +5,8 @@ import chisel3.util._ case class RFParam(depth: Int, readPort: Int = 2, width: Int = 8) { val indexBits: Int = log2Ceil(depth) + // todo: 4 bit for ecc + val memoryWidth: Int = width + 4 } class RegFileReadPort(param: RFParam) extends Bundle { @@ -26,7 +28,7 @@ class RegFile(param: RFParam) extends Module { val readPorts: Vec[RegFileReadPort] = IO(Vec(param.readPort, new RegFileReadPort(param))) val writePort: ValidIO[RegFileWritePort] = IO(Flipped(Valid(new RegFileWritePort(param)))) - val rf: SyncReadMem[UInt] = SyncReadMem(param.depth, UInt(param.width.W)) + val rf: SyncReadMem[UInt] = SyncReadMem(param.depth, UInt(param.memoryWidth.W)) readPorts.foreach(p => p.data := rf(p.addr)) diff --git a/v/src/V.scala b/v/src/V.scala index e0a336330..f7c73e92c 100644 --- a/v/src/V.scala +++ b/v/src/V.scala @@ -1056,9 +1056,10 @@ class V(val parameter: VParameter) extends Module with SerializableModule[VParam /** the index type of instruction is finished. * let LSU to kill the lane slot. + * todo: delete? */ val completeIndexInstruction: Bool = - ohCheck(lsu.lastReport, slots.last.record.instructionIndex, parameter.chainingSize) + ohCheck(lsu.lastReport, slots.last.record.instructionIndex, parameter.chainingSize) && !slots.last.state.idle val vrfWrite: Vec[DecoupledIO[VRFWriteRequest]] = Wire( Vec( @@ -1081,6 +1082,27 @@ class V(val parameter: VParameter) extends Module with SerializableModule[VParam val source1Select: UInt = Mux(decodeResult(Decoder.gather), gatherData, Mux(decodeResult(Decoder.itype), immSignExtend, source1Extend)) + + // data eew for extend type + val extendDataEEW: Bool = (requestReg.bits.csr.vSew >> decodeResult(Decoder.topUop)(1, 0))(0) + val gather16: Bool = decodeResult(Decoder.gather16) + val vSewSelect: UInt = Mux( + isLoadStoreType, + requestRegDequeue.bits.instruction(13, 12), + Mux( + decodeResult(Decoder.nr) || decodeResult(Decoder.maskLogic), + 2.U, + Mux(gather16, 1.U, Mux(decodeResult(Decoder.extend), extendDataEEW, requestReg.bits.csr.vSew)) + ) + ) + + val evlForLane: UInt = Mux( + decodeResult(Decoder.nr), + // evl for Whole Vector Register Move -> vs1 * (vlen / datapathWidth) + (requestRegDequeue.bits.instruction(17, 15) +& 1.U) ## 0.U(log2Ceil(parameter.vLen / parameter.datapathWidth).W), + requestReg.bits.csr.vl + ) + /** instantiate lanes. * TODO: move instantiate to top of class. */ @@ -1117,6 +1139,9 @@ class V(val parameter: VParameter) extends Module with SerializableModule[VParam laneReady(index) := lane.laneRequest.ready lane.csrInterface := requestReg.bits.csr + // index type EEW Decoded in the instruction + lane.csrInterface.vSew := vSewSelect + lane.csrInterface.vl := evlForLane lane.laneIndex := index.U // - LSU request next offset of group