Skip to content

Commit

Permalink
csr.bus: add CSRElement and CSRMultiplexer.
Browse files Browse the repository at this point in the history
  • Loading branch information
whitequark committed Oct 21, 2019
1 parent 1f20170 commit dc918fc
Show file tree
Hide file tree
Showing 5 changed files with 507 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Expand Up @@ -2,3 +2,7 @@
*.pyc
/*.egg-info
/.eggs

# tests
*.vcd
*.gtkw
Empty file added nmigen_soc/csr/__init__.py
Empty file.
255 changes: 255 additions & 0 deletions nmigen_soc/csr/bus.py
@@ -0,0 +1,255 @@
from functools import reduce
from nmigen import *
from nmigen import tracer


__all__ = ["CSRElement", "CSRMultiplexer"]


class CSRElement(Record):
"""Peripheral-side CSR interface.
A low-level interface to a single atomically readable and writable register in a peripheral.
This interface supports any register width and semantics, provided that both reads and writes
always succeed and complete in one cycle.
Parameters
----------
width : int
Width of the register.
name : str
Name of the underlying record.
Attributes
----------
r_data : Signal(width)
Read data. Must be always valid, and is sampled when ``r_stb`` is asserted.
r_stb : Signal()
Read strobe. Registers with read side effects should perform the read side effect when this
strobe is asserted.
w_data : Signal(width)
Write data. Valid only when ``w_stb`` is asserted.
w_stb : Signal()
Write strobe. Registers should update their value or perform the write side effect when
this strobe is asserted.
"""
def __init__(self, width, access, *, name=None, src_loc_at=0):
if not isinstance(width, int) or width < 0:
raise ValueError("Width must be a non-negative integer, not {!r}"
.format(width))
if access not in ("r", "w", "rw"):
raise ValueError("Access mode must be one of \"r\", \"w\", or \"rw\", not {!r}"
.format(access))

self.width = int(width)
self.access = access

layout = []
if "r" in self.access:
layout += [
("r_data", width),
("r_stb", 1),
]
if "w" in self.access:
layout += [
("w_data", width),
("w_stb", 1),
]
super().__init__(layout, name=name, src_loc_at=1)


class CSRMultiplexer(Elaboratable):
"""CPU-side CSR interface.
A low-level interface to a set of peripheral CSR registers that implements address-based
multiplexing and atomic updates of wide registers.
Operation
---------
The CSR multiplexer splits each CSR register into chunks according to its data width. Each
chunk is assigned an address, and the first chunk of each register always has the provided
minimum alignment. This allows accessing CSRs of any size using any datapath width.
When the first chunk of a register is read, the value of a register is captured, and reads
from subsequent chunks of the same register return the captured values. When any chunk except
the last chunk of a register is written, the written value is captured; a write to the last
chunk writes the captured value to the register. This allows atomically accessing CSRs larger
than datapath width.
Reads to padding bytes return zeroes, and writes to padding bytes are ignored.
Writes are registered, and add 1 cycle of latency.
Wide registers
--------------
Because the CSR bus conserves logic and routing resources, it is common to e.g. access
a CSR bus with an *n*-bit data path from a CPU with a *k*-bit datapath in cases where CSR
access latency is less important than resource usage. In this case, two strategies are
possible for connecting the CSR bus to the CPU:
* The CPU could access the CSR bus directly (with no intervening logic other than simple
translation of control signals). In this case, the register alignment should be set
to 1, and each *w*-bit register would occupy *ceil(w/n)* addresses from the CPU
perspective, requiring the same amount of memory instructions to access.
* The CPU could also access the CSR bus through a width down-converter, which would issue
*k/n* CSR accesses for each CPU access. In this case, the register alignment should be
set to *k/n*, and each *w*-bit register would occupy *ceil(w/k)* addresses from the CPU
perspective, requiring the same amount of memory instructions to access.
If alignment is greater than 1, it affects which CSR bus write is considered a write to
the last register chunk. For example, if a 24-bit register is used with a 8-bit CSR bus and
a CPU with a 32-bit datapath, a write to this register requires 4 CSR bus writes to complete
and the 4th write is the one that actually writes the value to the register. This allows
determining write latency solely from the amount of addresses the register occupies in
the CPU address space, and the width of the CSR bus.
Parameters
----------
addr_width : int
Address width. At most ``(2 ** addr_width) * data_width`` register bits will be available.
data_width : int
Data width. Registers are accessed in ``data_width`` sized chunks.
alignment : int
Register alignment. The address assigned to each register will be a multiple of
``2 ** alignment``.
Attributes
----------
addr : Signal(addr_width)
Address for reads and writes.
r_data : Signal(data_width)
Read data. Valid on the next cycle after ``r_stb`` is asserted.
r_stb : Signal()
Read strobe. If ``addr`` points to the first chunk of a register, captures register value
and causes read side effects to be performed (if any). If ``addr`` points to any chunk
of a register, latches the captured value to ``r_data``. Otherwise, latches zero
to ``r_data``.
w_data : Signal(data_width)
Write data. Must be valid when ``w_stb`` is asserted.
w_stb : Signal()
Write strobe. If ``addr`` points to the last chunk of a register, writes captured value
to the register and causes write side effects to be performed (if any). If ``addr`` points
to any chunk of a register, latches ``w_data`` to the captured value. Otherwise, does
nothing.
"""
def __init__(self, *, addr_width, data_width, alignment=0):
if not isinstance(addr_width, int) or addr_width <= 0:
raise ValueError("Address width must be a positive integer, not {!r}"
.format(addr_width))
if not isinstance(data_width, int) or data_width <= 0:
raise ValueError("Data width must be a positive integer, not {!r}"
.format(data_width))
if not isinstance(alignment, int) or alignment < 0:
raise ValueError("Alignment must be a non-negative integer, not {!r}"
.format(alignment))

self.addr_width = int(addr_width)
self.data_width = int(data_width)
self.alignment = alignment

self._next_addr = 0
self._elements = dict()

self.addr = Signal(addr_width)
self.r_data = Signal(data_width)
self.r_stb = Signal()
self.w_data = Signal(data_width)
self.w_stb = Signal()

def add(self, element):
"""Add a register.
Arguments
---------
element : CSRElement
Interface of the register.
Return value
------------
An ``(addr, size)`` tuple, where ``addr`` is the address assigned to the first chunk of
the register, and ``size`` is the amount of chunks it takes, which may be greater than
``element.size // self.data_width`` due to alignment.
"""
if not isinstance(element, CSRElement):
raise TypeError("Element must be an instance of CSRElement, not {!r}"
.format(element))

addr = self.align_to(self.alignment)
self._next_addr += (element.width + self.data_width - 1) // self.data_width
size = self.align_to(self.alignment) - addr
self._elements[addr] = element, size
return addr, size

def align_to(self, alignment):
"""Align the next register explicitly.
Arguments
---------
alignment : int
Register alignment. The address assigned to the next register will be a multiple of
``2 ** alignment`` or ``2 ** self.alignment``, whichever is greater.
Return value
------------
Address of the next register.
"""
if not isinstance(alignment, int) or alignment < 0:
raise ValueError("Alignment must be a non-negative integer, not {!r}"
.format(alignment))

align_chunks = 1 << alignment
if self._next_addr % align_chunks != 0:
self._next_addr += align_chunks - (self._next_addr % align_chunks)
return self._next_addr

def elaborate(self, platform):
m = Module()

# Instead of a straightforward multiplexer for reads, use a per-element address comparator,

This comment has been minimized.

Copy link
@jordens

jordens Oct 21, 2019

Member

Are you sure this is correct? In fact you use fanin |= Mux(stb, data, 0) in the reduction in line 234 which is a big one-hot mux chain or (viewed differently) a k-OR tree plus n 2-MUX.

This comment has been minimized.

Copy link
@whitequark

whitequark Oct 21, 2019

Author Contributor

Indeed it is outdated. The original design used DFFRs for the shadow register with the reset input connected to the negation of address match (for any part of the register), but then I realized it doesn't work properly if you try to pipeline reads back-to-back, since the read mux is combinatorial. I could register the address, but that would duplicate all of the address recognition logic, so it's worse than the solution I have here.

Unless you know some clever (and more efficient) solution to this issue that I'm missing, I'm just going to fix the comment.

This comment has been minimized.

Copy link
@whitequark

whitequark Oct 21, 2019

Author Contributor

The original design used DFFRs for the shadow register with the reset input connected to the negation of address match (for any part of the register)

Also, now that I think about it more carefully, even if that design worked, it wouldn't have been any better because you have to use a mux tree anyway to extract the slice of the shadow register.

This comment has been minimized.

Copy link
@jordens

jordens Oct 21, 2019

Member

Yes. I don't think there is a simple way around it.
The only idea is to make sure the read path can be mapped to block RAM. That requires grouping those CSRs that are guaranteed to never be written to from the peripheral side behind a block RAM shadowing them. The RAM then intercepts/shadows CPU writes and supplies all CPU reads. It obviously doesn't work for CSRs that are set by peripherals.
Maybe that's not such a bad idea after all, the "read-write" CSRs could still be done with the current logic tree and all "write-only" CSRs would be extracted into the shadow RAM.

This comment has been minimized.

Copy link
@whitequark

whitequark Oct 21, 2019

Author Contributor

Is there really a write restriction? Sure, you have to handle the case where the CPU and the peripheral write to the register at the same cycle, so you would need to add priority/arbitration logic. But that is already required; though the only generic case that is obviously non-racy that I can imagine would be "r_c1" event flag registers, where clearing the flag at the same cycle as an event occurring would give the event priority. You could, in principle, use a TDP BRAM with both ports in the same clock domain, and add arbitration logic to the existing address comparators, similar to how the synthesizer would add transparency to a TDP BRAM. In the degenerate case, where it can be statically proven by the synthesizer that the writes never collide (really the only case you're interested in), this logic would be removed, and it'd still be correct otherwise.

But I think there's much worse issues with this idea. (1) Yosys can neither infer nor even describe an asymmetric BRAM, so the peripheral port would have the same geometry as the CPU port, which isn't even known at the time when the peripheral is instantiated. I'm not sure if Vivado can infer one from a Verilog macro either, other toolchains almost certainly can't. (2) Peripherals that need random access to a large array of registers precisely one element at a time seem rare. (3) The use cases for 18 Kbits of registers on the CSR bus seem scarce.

If I was trying to solve this, I would add an address + data port on the CSR bus using regular CSR registers, and would access the memory through this window. To compensate for increased latency of non-random access, I would make the address auto-increment, since most CPU accesses are probably sequential. And of course, this allows using SDP or even SP BRAM. But in the first place, I don't really understand why you want this.

This comment has been minimized.

Copy link
@jordens

jordens Oct 21, 2019

Member

That's not what I described. This is not about collisions but about coherency. This would be a single port RAM that collects CPU writes and supplies CPU reads. The CPU does not use parallel access so this is OK. The CPU writes would additionally always go to the actual target registers that the peripherals look at (in parallel). The peripherals never see the RAM. This is always consistent as long as CPU reads from those registers that can be written to by the peripheral bypass the RAM (the same mux architecture as now). This way you save the mux tree for all registers that are never written to by the peripheral. The cost is block ram. This works better if the registers are packed densely in address space, if block RAM is less expensive than the logic otherwise required, and if there are few registers that are changed by peripherals. And it's only relevant if there are lots of CSRs that are read-write from the CPU (for configuration and read-back) but read-only from the peripheral perspective.
It's not my idea. I know that somebody used this in an ASIC where the big MUX tree for CSR read back was prohibitive. But I forget who and where it was.

# clear the shadow register when it does not match, and OR every selected shadow register
# part to form the output. This can save a significant amount of logic; the size of
# a complete k-OR or k-MUX gate tree for n inputs is `s = ceil((n - 1) / (k - 1))`,
# and its logic depth is `ceil(log_k(s))`, but a 4-LUT can implement either a 4-OR or
# a 2-MUX gate.
r_data_fanin = 0

for elem_addr, (elem, elem_size) in self._elements.items():
shadow = Signal(elem.width, name="{}__shadow".format(elem.name))
if "w" in elem.access:
m.d.comb += elem.w_data.eq(shadow)

# Enumerate every address used by the register explicitly, rather than using
# arithmetic comparisons, since some toolchains (e.g. Yosys) are too eager to infer
# carry chains for comparisons, even with a constant. (Register sizes don't have
# to be powers of 2.)
with m.Switch(self.addr):
for chunk_offset in range(elem_size):
chunk_slice = slice(chunk_offset * self.data_width,
(chunk_offset + 1) * self.data_width)
with m.Case(elem_addr + chunk_offset):
if "r" in elem.access:
chunk_r_stb = Signal(self.data_width,
name="{}__r_stb_{}".format(elem.name, chunk_offset))
r_data_fanin |= Mux(chunk_r_stb, shadow[chunk_slice], 0)
if chunk_offset == 0:
m.d.comb += elem.r_stb.eq(self.r_stb)
with m.If(self.r_stb):
m.d.sync += shadow.eq(elem.r_data)
# Delay by 1 cycle, allowing reads to be pipelined.
m.d.sync += chunk_r_stb.eq(self.r_stb)

if "w" in elem.access:
if chunk_offset == elem_size - 1:
# Delay by 1 cycle, avoiding combinatorial paths through
# the CSR bus and into CSR registers.
m.d.sync += elem.w_stb.eq(self.w_stb)
with m.If(self.w_stb):
m.d.sync += shadow[chunk_slice].eq(self.w_data)

with m.Default():
m.d.sync += shadow.eq(0)

m.d.comb += self.r_data.eq(r_data_fanin)

return m
Empty file added nmigen_soc/test/__init__.py
Empty file.

0 comments on commit dc918fc

Please sign in to comment.