gateware: copy phaser (df3825a)

m-labs · Jul 22, 2016 · d83046d · d83046d
1 parent ecda947
commit d83046d
Showing 13 changed files with 1,173 additions and 1 deletion.
diff --git a/artiq/gateware/dsp/__init__.py b/artiq/gateware/dsp/__init__.py
diff --git a/artiq/gateware/dsp/accu.py b/artiq/gateware/dsp/accu.py
@@ -0,0 +1,112 @@
+from migen import *
+from misoc.interconnect.stream import Endpoint
+
+
+class Accu(Module):
+    def __init__(self, width, meta=[]):
+        self.i = Endpoint([("p", width), ("f", width), ("clr", 1)])
+        self.o = Endpoint([("z", width)])
+        self.latency = 1
+
+        ###
+
+        f = Signal.like(self.i.f)
+        p = Signal.like(self.i.p)
+        self.comb += self.i.ack.eq(~self.o.stb | self.o.ack)
+        self.sync += [
+            If(self.o.ack,
+                self.o.stb.eq(0),
+            ),
+            If(self.i.ack,
+                self.o.stb.eq(1),
+                If(self.i.stb,
+                    self.o.z.eq(self.i.p + Mux(self.i.clr, 0, self.o.z + p)),
+                    f.eq(self.i.f),
+                    p.eq(self.i.f - self.i.p),
+                ).Else(
+                    self.o.z.eq(self.o.z + f),
+                )
+            )
+        ]
+
+
+class MCM(Module):
+    def __init__(self, width, constants):
+        n = len(constants)
+        self.i = i = Signal(width)
+        self.o = o = [Signal.like(self.i) for i in range(n)]
+
+        ###
+
+        # TODO: improve MCM
+        assert range(n) == constants
+        assert n <= 9
+
+        if n > 0:
+            self.comb += o[0].eq(0)
+        if n > 1:
+            self.comb += o[1].eq(i)
+        if n > 2:
+            self.comb += o[2].eq(i << 1)
+        if n > 3:
+            self.comb += o[3].eq(i + (i << 1))
+        if n > 4:
+            self.comb += o[4].eq(i << 2)
+        if n > 5:
+            self.comb += o[5].eq(i + i << 2)
+        if n > 6:
+            self.comb += o[6].eq(o[3] << 1)
+        if n > 7:
+            self.comb += o[7].eq((i << 3) - i)
+        if n > 8:
+            self.comb += o[8].eq(i << 3)
+
+
+class PhasedAccu(Module):
+    """Accumulator that produces `parallelism` output values at once.
+
+    The input endpoint ``i`` carries ``p``, ``f`` and ``clr``. The output
+    endpoint ``o`` carries the fields ``z0 .. z{parallelism - 1}``.
+
+    Parameters
+    ----------
+    width : int
+        Bit width of ``p``, ``f`` and of every ``z`` field.
+    parallelism : int
+        Number of output fields. The internal :class:`MCM` is built for
+        ``parallelism + 1`` constants and asserts at most 9 of them.
+
+    Attributes
+    ----------
+    latency : int
+        Set to 2.
+    """
+    def __init__(self, width, parallelism=8):
+        self.i = Endpoint([("p", width), ("f", width), ("clr", 1)])
+        self.o = Endpoint([("z{}".format(i), width) for i in
+                           range(parallelism)])
+        self.parallelism = parallelism
+        self.latency = 2
+
+        ###
+
+        # a.o[k] == k*i.f for k = 0 .. parallelism
+        a = MCM(width, range(parallelism + 1))
+        self.submodules += a
+        # z[k]: per-output offsets computed when an input is accepted
+        z = [Signal(width) for i in range(parallelism)]
+        o = self.o.payload.flatten()
+        # load: an accepted input is waiting to be applied to the outputs
+        load = Signal()
+        # clr, p: registered copies of the last accepted i.clr and i.p
+        clr = Signal()
+        p = Signal.like(self.i.p)
+        # f: parallelism*i.f of the last accepted input
+        # fp: the value of f that is currently added to the outputs
+        f = Signal.like(self.i.f)
+        fp = Signal.like(self.i.f)
+        self.comb += [
+            self.i.ack.eq(self.o.ack),
+            a.i.eq(self.i.f),
+        ]
+
+        self.sync += [
+            If(self.o.ack,
+                self.o.stb.eq(0),
+            ),
+            If(~self.o.stb | self.o.ack,
+                self.o.stb.eq(1),
+                If(load,
+                    # apply the pending input: every output restarts from
+                    # its offset, optionally on top of the advanced o[0]
+                    load.eq(0),
+                    [oi.eq(Mux(clr, 0, o[0] + fp) + zi)
+                     for oi, zi in zip(o, z)],
+                    fp.eq(f),
+                ).Else(
+                    # no pending input: advance every output by fp
+                    [oi.eq(oi + fp) for oi in o],
+                ),
+            ),
+            # listed last so that `load.eq(1)` takes precedence over the
+            # `load.eq(0)` above when both fire in the same cycle
+            If(self.i.stb & self.i.ack,
+                [zi.eq(self.i.p - Mux(self.i.clr, 0, p) + aoi)
+                 for zi, aoi in zip(z, a.o)],
+                clr.eq(self.i.clr),
+                p.eq(self.i.p),
+                f.eq(a.o[parallelism]),
+                load.eq(1),
+            ),
+        ]
diff --git a/artiq/gateware/dsp/cordic.py b/artiq/gateware/dsp/cordic.py
@@ -0,0 +1,358 @@
+# Copyright 2014-2015 Robert Jordens <jordens@gmail.com>
+#
+# This file is part of redpid.
+#
+# redpid is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# redpid is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with redpid.  If not, see <http://www.gnu.org/licenses/>.
+
+from math import atan, atanh, log, sqrt, pi
+
+from migen import *
+
+
+class TwoQuadrantCordic(Module):
+    """Coordinate rotation digital computer
+
+    Trigonometric, and arithmetic functions implemented using
+    additions/subtractions and shifts.
+
+    http://eprints.soton.ac.uk/267873/1/tcas1_cordic_review.pdf
+
+    http://www.andraka.com/files/crdcsrvy.pdf
+
+    http://zatto.free.fr/manual/Volder_CORDIC.pdf
+
+    The way the CORDIC is executed is controlled by `eval_mode`.
+    If `"iterative"` the stages are iteratively evaluated, one per clock
+    cycle. This mode uses the least amount of registers, but has the
+    lowest throughput and highest latency.  If `"pipelined"` all stages
+    are executed in every clock cycle but separated by registers.  This
+    mode has full throughput but uses many registers and has large
+    latency. If `"combinatorial"`, there are no registers, throughput is
+    maximal and latency is zero. `"pipelined"` and `"combinatorial"` use
+    the same number of shifters and adders.
+
+    The type of trigonometric/arithmetic function is determined by
+    `cordic_mode` and `func_mode`. :math:`g` is the gain of the CORDIC.
+
+        * rotate-circular: rotate the vector `(xi, yi)` by an angle `zi`.
+          Used to calculate trigonometric functions, `sin(), cos(),
+          tan() = sin()/cos()`, or to perform polar-to-cartesian coordinate
+          transformation:
+
+            .. math::
+                x_o = g \\cos(z_i) x_i - g \\sin(z_i) y_i
+
+                y_o = g \\sin(z_i) x_i + g \\cos(z_i) y_i
+
+        * vector-circular: determine length and angle of the vector
+          `(xi, yi)`.  Used to calculate `arctan(), sqrt()` or
+          to perform cartesian-to-polar transformation:
+
+            .. math::
+                x_o = g\\sqrt{x_i^2 + y_i^2}
+
+                z_o = z_i + \\tan^{-1}(y_i/x_i)
+
+        * rotate-hyperbolic: hyperbolic functions of `zi`. Used to
+          calculate hyperbolic functions, `sinh, cosh, tanh = sinh/cosh,
+          exp = cosh + sinh`:
+
+            .. math::
+                x_o = g \\cosh(z_i) x_i + g \\sinh(z_i) y_i
+
+                y_o = g \\sinh(z_i) x_i + g \\cosh(z_i) y_i
+
+        * vector-hyperbolic: natural logarithm `ln(), arctanh()`, and
+          `sqrt()`. Use `x_i = a + b` and `y_i = a - b` to obtain `2*
+          sqrt(a*b)` and `ln(a/b)/2`:
+
+            .. math::
+                x_o = g\\sqrt{x_i^2 - y_i^2}
+
+                z_o = z_i + \\tanh^{-1}(y_i/x_i)
+
+        * rotate-linear: multiply and accumulate (not a very good
+          multiplier implementation):
+
+            .. math::
+                y_o = g(y_i + x_i z_i)
+
+        * vector-linear: divide and accumulate:
+
+            .. math::
+                z_o = g(z_i + y_i/x_i)
+
+    Parameters
+    ----------
+    width : int
+        Bit width of the input and output signals. Defaults to 16. Input
+        and output signals are signed.
+    widthz : int
+        Bit width of `zi` and `zo`. Defaults to the `width`.
+    stages : int or None
+        Number of CORDIC incremental rotation stages. Defaults to
+        `width + min(1, guard)`.
+    guard : int or None
+        Add guard bits to the intermediate signals. If `None`,
+        defaults to `guard = log2(width)` which guarantees accuracy
+        to `width` bits.
+    eval_mode : str, {"iterative", "pipelined", "combinatorial"}
+    cordic_mode : str, {"rotate", "vector"}
+    func_mode : str, {"circular", "linear", "hyperbolic"}
+        Evaluation and arithmetic mode. See above.
+
+    Attributes
+    ----------
+    xi, yi, zi : Signal(width), in
+        Input values, signed.
+    xo, yo, zo : Signal(width), out
+        Output values, signed.
+    new_out : Signal(1), out
+        Asserted if output values are freshly updated in the current
+        cycle.
+    new_in : Signal(1), out
+        Asserted if new input values are being read in the next cycle.
+    zmax : float
+        `zi` and `zo` normalization factor. Floating point `zmax`
+        corresponds to `1<<(widthz - 1)`. `x` and `y` are scaled such
+        that floating point `1` corresponds to `1<<(width - 1)`.
+    gain : float
+        Cumulative, intrinsic gain and scaling factor. In circular mode
+        `sqrt(xi**2 + yi**2)` should be no larger than `2**(width - 1)/gain`
+        to prevent overflow. Additionally, in hyperbolic and linear mode,
+        the operation itself can cause overflow.
+    interval : int
+        Output interval in clock cycles. Inverse throughput.
+    latency : int
+        Input-to-output latency. The result corresponding to the inputs
+        appears at the outputs `latency` cycles later.
+
+    Notes
+    -----
+
+    Each stage `i` in the CORDIC performs the following operation:
+
+    .. math::
+        x_{i+1} = x_i - m d_i y_i r^{-s_{m,i}},
+
+        y_{i+1} = y_i + d_i x_i r^{-s_{m,i}},
+
+        z_{i+1} = z_i - d_i a_{m,i},
+
+    where:
+
+        * :math:`d_i`: clockwise or counterclockwise, determined by
+          `sign(z_i)` in rotate mode or `sign(-y_i)` in vector mode.
+
+        * :math:`r`: radix of the number system (2)
+
+        * :math:`m`: 1: circular, 0: linear, -1: hyperbolic
+
+        * :math:`s_{m,i}`: non decreasing integer shift sequence
+
+        * :math:`a_{m,i}`: elementary rotation angle: :math:`a_{m,i} =
+          \\tan^{-1}(\\sqrt{m} s_{m,i})/\\sqrt{m}`.
+    """
+    def __init__(self, width=16, widthz=None, stages=None, guard=0,
+                 eval_mode="iterative", cordic_mode="rotate",
+                 func_mode="circular"):
+        # validate parameters
+        assert eval_mode in ("combinatorial", "pipelined", "iterative")
+        assert cordic_mode in ("rotate", "vector")
+        assert func_mode in ("circular", "linear", "hyperbolic")
+        self.cordic_mode = cordic_mode
+        self.func_mode = func_mode
+        if guard is None:
+            # guard bits to guarantee "width" accuracy
+            guard = int(log(width)/log(2))
+        if widthz is None:
+            widthz = width
+        if stages is None:
+            stages = width + min(1, guard)  # cuts error below LSB
+
+        # input output interface
+        self.xi = Signal((width, True))
+        self.yi = Signal((width, True))
+        self.zi = Signal((widthz, True))
+        self.xo = Signal((width, True))
+        self.yo = Signal((width, True))
+        self.zo = Signal((widthz, True))
+        self.new_in = Signal()
+        self.new_out = Signal()
+
+        ###
+
+        # a: integer rotation angles per stage, s: shift amounts per stage
+        a, s, self.zmax, self.gain = self._constants(stages, widthz + guard)
+        stages = len(a)  # may have increased due to repetitions
+
+        if eval_mode == "iterative":
+            # input, working and output register sets
+            num_sig = 3
+            self.interval = stages + 1
+            self.latency = stages + 2
+        else:
+            # one signal set in front of and behind every stage
+            num_sig = stages + 1
+            self.interval = 1
+            if eval_mode == "pipelined":
+                self.latency = stages
+            else:  # combinatorial
+                self.latency = 0
+
+        # inter-stage signals
+        x = [Signal((width + guard, True)) for i in range(num_sig)]
+        y = [Signal((width + guard, True)) for i in range(num_sig)]
+        z = [Signal((widthz + guard, True)) for i in range(num_sig)]
+
+        # hook up inputs and outputs to the first and last inter-stage
+        # signals
+        self.comb += [
+            x[0].eq(self.xi << guard),
+            y[0].eq(self.yi << guard),
+            z[0].eq(self.zi << guard),
+            self.xo.eq(x[-1] >> guard),
+            self.yo.eq(y[-1] >> guard),
+            self.zo.eq(z[-1] >> guard),
+            ]
+
+        if eval_mode == "iterative":
+            # We afford one additional iteration for in/out.
+            # i: iteration counter running 0 .. stages
+            i = Signal(max=stages + 1)
+            self.comb += [
+                self.new_in.eq(i == stages),
+                self.new_out.eq(i == 1),
+            ]
+            # ai: registered angle constant looked up by the counter
+            ai = Signal((widthz + guard, True))
+            self.sync += ai.eq(Array(a)[i])
+            if range(stages) == s:
+                si = i - 1  # shortcut if no stage repetitions
+            else:
+                # registered shift amount looked up by the counter
+                si = Signal(max=stages + 1)
+                self.sync += si.eq(Array(s)[i])
+            # x[1], y[1], z[1] are the working registers, updated in place
+            xi, yi, zi = x[1], y[1], z[1]
+            self.sync += [
+                self._stage(xi, yi, zi, xi, yi, zi, si, ai),
+                i.eq(i + 1),
+                If(i == stages,
+                   i.eq(0),
+                ),
+                # listed after the stage statements so that these
+                # assignments take precedence when i == 0: the finished
+                # result is copied out and the next inputs are loaded
+                If(i == 0,
+                   x[2].eq(xi), y[2].eq(yi), z[2].eq(zi),
+                   xi.eq(x[0]), yi.eq(y[0]), zi.eq(z[0]),
+                )
+            ]
+        else:
+            self.comb += [
+                self.new_out.eq(1),
+                self.new_in.eq(1),
+            ]
+            # chain one stage per entry of the shift sequence
+            for i, si in enumerate(s):
+                stmt = self._stage(x[i], y[i], z[i],
+                                   x[i + 1], y[i + 1], z[i + 1],
+                                   si, a[i])
+                if eval_mode == "pipelined":
+                    self.sync += stmt
+                else:  # combinatorial
+                    self.comb += stmt
+
+    def _constants(self, stages, bits):
+        """Compute the per-stage constants for the selected `func_mode`.
+
+        Parameters
+        ----------
+        stages : int
+            Requested number of stages. In hyperbolic mode some stages
+            are repeated, so more entries than `stages` are returned.
+        bits : int
+            Bit width of the `z` data path; angles are scaled such that
+            `zmax` corresponds to `2**(bits - 1)`.
+
+        Returns
+        -------
+        a : list of int
+            Scaled rotation angle of each stage.
+        s : range or list of int
+            Shift amount of each stage.
+        zmax : float
+            Normalization factor of `z`.
+        gain : float
+            Product of the per-stage gains.
+        """
+        if self.func_mode == "circular":
+            s = range(stages)
+            a = [atan(2**-i) for i in s]
+            g = [sqrt(1 + 2**(-2*i)) for i in s]
+            #zmax = sum(a)
+            # use pi anyway as the input z can cause overflow
+            # and we need the range for quadrant mapping
+            zmax = pi
+        elif self.func_mode == "linear":
+            s = range(stages)
+            a = [2**-i for i in s]
+            g = [1 for i in s]
+            #zmax = sum(a)
+            # use 2 anyway as this simplifies a and scaling
+            zmax = 2.
+        else:  # hyperbolic
+            s = []
+            # need to repeat some stages:
+            # shifts 4, 13, 40, ... (j -> 3*j + 1) are emitted twice
+            j = 4
+            for i in range(stages):
+                if i == j:
+                    s.append(j)
+                    j = 3*j + 1
+                s.append(i + 1)
+            a = [atanh(2**-i) for i in s]
+            g = [sqrt(1 - 2**(-2*i)) for i in s]
+            zmax = sum(a)*2
+        # round here helps the width=2**i - 1 case but hurts the
+        # important width=2**i case
+        cast = int
+        if log(bits)/log(2) % 1:
+            # bits is not a power of two
+            cast = round
+        a = [cast(ai*2**(bits - 1)/zmax) for ai in a]
+        gain = 1.
+        for gi in g:
+            gain *= gi
+        return a, s, zmax, gain
+
+    def _stage(self, xi, yi, zi, xo, yo, zo, i, ai):
+        """Build the statements of one CORDIC micro-rotation.
+
+        A fresh direction signal is created and its combinatorial driver
+        is added to `self.comb` as a side effect.
+
+        Parameters
+        ----------
+        xi, yi, zi
+            Stage input values.
+        xo, yo, zo
+            Signals assigned with the stage result.
+        i
+            Shift amount applied to the cross terms (an integer or a
+            migen expression).
+        ai
+            Rotation angle added to or subtracted from `z`.
+
+        Returns
+        -------
+        list
+            The three assignments to `xo`, `yo` and `zo`; the caller
+            decides whether they are synchronous or combinatorial.
+        """
+        dir = Signal()
+        if self.cordic_mode == "rotate":
+            # drive z towards zero
+            self.comb += dir.eq(zi < 0)
+        else:  # vector
+            # drive y towards zero
+            self.comb += dir.eq(yi >= 0)
+        dx = yi >> i
+        dy = xi >> i
+        dz = ai
+        if self.func_mode == "linear":
+            # x is not modified in linear mode
+            dx = 0
+        elif self.func_mode == "hyperbolic":
+            dx = -dx
+        stmt = [
+            xo.eq(xi + Mux(dir, dx, -dx)),
+            yo.eq(yi + Mux(dir, -dy, dy)),
+            zo.eq(zi + Mux(dir, dz, -dz))
+        ]
+        return stmt
+
+
+class Cordic(TwoQuadrantCordic):
+    """Four-quadrant CORDIC
+
+    Same as :class:`TwoQuadrantCordic` but with support and convergence
+    for `abs(zi) > pi/2` in circular rotate mode or `xi < 0` in circular
+    vector mode.
+
+    All keyword arguments are forwarded to :class:`TwoQuadrantCordic`.
+    In the other function modes no remapping logic is added.
+    """
+    def __init__(self, **kwargs):
+        TwoQuadrantCordic.__init__(self, **kwargs)
+        if self.func_mode != "circular":
+            return  # no need to remap quadrants
+
+        # keep the inner two-quadrant inputs and expose fresh signals of
+        # the same shape as the public inputs
+        cxi, cyi, czi = self.xi, self.yi, self.zi
+        self.xi = xi = Signal.like(cxi)
+        self.yi = yi = Signal.like(cyi)
+        self.zi = zi = Signal.like(czi)
+
+        ###
+
+        # q: the input has to be remapped before entering the inner CORDIC
+        q = Signal()
+        if self.cordic_mode == "rotate":
+            # the two top bits of zi differ
+            self.comb += q.eq(zi[-2] ^ zi[-1])
+        else:  # vector
+            self.comb += q.eq(xi < 0)
+        self.comb += [
+            If(q,
+                # negate the vector and offset z by 1 << (len(zi) - 1)
+                # (`-` binds tighter than `<<`)
+                Cat(cxi, cyi, czi).eq(
+                    Cat(-xi, -yi, zi + (1 << len(zi) - 1)))
+            ).Else(
+                Cat(cxi, cyi, czi).eq(Cat(xi, yi, zi))
+            )
+        ]
diff --git a/artiq/gateware/dsp/phaser.py b/artiq/gateware/dsp/phaser.py
@@ -0,0 +1,44 @@
+from migen import *
+
+
+class Phaser(Module):
+    """Feed `parallelism` parallel modules from one single-step module.
+
+    Parameters
+    ----------
+    factory : callable
+        Called as ``factory(step=1)`` once and as
+        ``factory(step=parallelism)`` `parallelism` times. The returned
+        modules are used through their ``i``, ``o``, ``ce`` and
+        ``latency`` attributes.
+    parallelism : int
+        Number of parallel modules.
+
+    Attributes
+    ----------
+    i
+        Input endpoint of the single-step module.
+    o : list
+        Output endpoints of the parallel modules.
+    ce : Signal(1), in
+        Forwarded to the ``ce`` of every parallel module.
+    busy : Signal(1), out
+        Asserted while the shift counter is non-zero.
+    latency : int
+        ``q.latency*parallelism + p[-1].latency``.
+    """
+    def __init__(self, factory, parallelism=8):
+        # q: single-step module, p: the parallel modules
+        q = factory(step=1)
+        p = [factory(step=parallelism) for i in range(parallelism)]
+
+        self.i = q.i
+        self.o = [pi.o for pi in p]
+        self.ce = Signal()
+        self.busy = Signal()
+        self.parallelism = parallelism
+        self.latency = q.latency*parallelism + p[-1].latency
+
+        ###
+
+        self.submodules += q, p
+
+        # n: number of shift steps still to be done
+        n = Signal(max=parallelism)
+        # shift: a new input was accepted and not yet handed to `p`
+        shift = Signal()
+        self.comb += [
+            self.busy.eq(n != 0),
+            q.ce.eq(q.i.stb | self.busy),
+            # the last parallel module takes its payload directly from q
+            p[-1].i.payload.eq(q.o.payload),
+            p[-1].i.stb.eq(shift & ~self.busy),
+            [pi.i.stb.eq(p[-1].i.stb) for pi in p[:-1]],
+            [pi.ce.eq(self.ce) for pi in p],
+        ]
+        self.sync += [
+            If(n == 0,
+                If(p[-1].i.ack,
+                    shift.eq(0),
+                ),
+            ).Elif(q.o.stb,
+                # move every payload one module down the chain
+                [p[i].i.payload.eq(p[i + 1].i.payload)
+                 for i in range(parallelism - 1)],
+                n.eq(n - 1),
+            ),
+            # listed last so that a new input takes precedence over the
+            # updates of `shift` and `n` above
+            If(q.i.stb,
+                shift.eq(1),
+                n.eq(parallelism - 1),
+            ),
+        ]
diff --git a/artiq/gateware/dsp/sawg.py b/artiq/gateware/dsp/sawg.py
@@ -0,0 +1,286 @@
+from migen import *
+from misoc.interconnect.stream import Endpoint
+
+from .cordic import Cordic
+from .spline import Spline
+from .accu import PhasedAccu, Accu
+from .tools import Delay, eqh, SatAddMixin
+
+
+class DDSFast(Module):
+    """DDS with spline inputs and `parallelism` I/Q output pairs.
+
+    Amplitude, phase and frequency each come from a :class:`Spline`. Phase
+    and frequency feed a :class:`PhasedAccu`, whose outputs drive one
+    pipelined :class:`Cordic` per output pair.
+
+    Parameters
+    ----------
+    width : int
+        Bit width of the output signals.
+    t_width : int or None
+        Passed to ``Spline.tri()``; defaults to `width`.
+    a_width, p_width, f_width : int or None
+        Widths of the amplitude, phase and frequency splines. If `None`
+        they are derived from `width`, `t_width` and the spline order.
+    a_order, p_order, f_order : int
+        Orders of the amplitude, phase and frequency splines.
+    parallelism : int
+        Number of output pairs.
+
+    Attributes
+    ----------
+    a, f, p
+        Input endpoints returned by ``Spline.tri()``.
+    i : list
+        ``[a, f, p]``.
+    o : list of [Signal, Signal]
+        Per output pair the Cordic ``xo`` and ``yo`` values, signed.
+    ce, clr : Signal(1), in
+    latency : int
+        Sum of the phase spline, accumulator and Cordic latencies.
+    gain : float
+        Gain of the Cordic.
+    """
+    def __init__(self, width, t_width=None,
+                 a_width=None, p_width=None, f_width=None,
+                 a_order=4, p_order=1, f_order=2, parallelism=8):
+        if t_width is None:
+            t_width = width
+        if a_width is None:
+            a_width = width + (a_order - 1)*t_width
+        if p_width is None:
+            p_width = width + (p_order - 1)*t_width
+        if f_width is None:
+            f_width = width + (f_order + 1)*t_width
+        a = Spline(order=a_order, width=a_width)
+        p = Spline(order=p_order, width=p_width)
+        f = Spline(order=f_order, width=f_width)
+        self.submodules += a, p, f
+
+        self.a = a.tri(t_width)
+        self.f = f.tri(t_width)
+        self.p = p.tri(t_width)
+        self.i = [self.a, self.f, self.p]
+        self.o = [[Signal((width, True)) for i in range(2)]
+                  for i in range(parallelism)]
+        self.ce = Signal()
+        self.clr = Signal()
+        self.parallelism = parallelism
+        self.latency = 0  # will be accumulated
+
+        ###
+
+        self.latency += p.latency
+        q = PhasedAccu(f_width, parallelism)
+        self.submodules += q
+        self.latency += q.latency
+        # amplitude delay line, one register per accumulator latency cycle
+        da = [Signal((width, True)) for i in range(q.latency)]
+
+        self.sync += [
+            If(q.i.stb & q.i.ack,
+                eqh(da[0], a.o.a0),
+                [da[i + 1].eq(da[i]) for i in range(len(da) - 1)],
+            ),
+            # the clear request is dropped once the phase spline output
+            # has been transferred ...
+            If(p.o.stb & p.o.ack,
+                q.i.clr.eq(0),
+            ),
+            # ... and sampled from `clr` whenever a new phase input is
+            # accepted (later statement, takes precedence)
+            If(p.i.stb & p.i.ack,
+                q.i.clr.eq(self.clr),
+            ),
+        ]
+        self.comb += [
+            a.o.ack.eq(self.ce),
+            p.o.ack.eq(self.ce),
+            f.o.ack.eq(self.ce),
+            q.i.stb.eq(self.ce),
+            # eqh comes from .tools; NOTE(review): presumably it assigns
+            # with the most significant bits aligned -- confirm there
+            eqh(q.i.p, p.o.a0),
+            q.i.f.eq(f.o.a0),
+            q.o.ack.eq(1),
+        ]
+
+        c = []
+        for i in range(parallelism):
+            ci = Cordic(width=width, widthz=p_width,
+                        guard=None, eval_mode="pipelined")
+            self.submodules += ci
+            c.append(ci)
+            qoi = getattr(q.o, "z{}".format(i))
+            # rotate the vector (amplitude, 0) by the accumulated phase
+            self.comb += [
+                ci.xi.eq(da[-1]),
+                ci.yi.eq(0),
+                eqh(ci.zi, qoi),
+                eqh(self.o[i][0], ci.xo),
+                eqh(self.o[i][1], ci.yo),
+            ]
+        self.latency += c[0].latency
+        self.gain = c[0].gain
+
+
+class DDSSlow(Module):
+    """DDS with spline inputs and a single I/Q output pair.
+
+    Amplitude, phase and frequency each come from a :class:`Spline`. Phase
+    and frequency feed an :class:`Accu`, whose output drives one pipelined
+    :class:`Cordic`.
+
+    Parameters
+    ----------
+    width : int
+        Bit width of the output signals.
+    t_width : int
+        Passed to ``Spline.tri()``.
+    a_width, p_width, f_width : int
+        Widths of the amplitude, phase and frequency splines.
+    a_order, p_order, f_order : int
+        Orders of the amplitude, phase and frequency splines.
+
+    Attributes
+    ----------
+    a, f, p
+        Input endpoints returned by ``Spline.tri()``.
+    i : list
+        ``[a, f, p]``.
+    o : list of Signal
+        The Cordic ``xo`` and ``yo`` values, signed.
+    ce, clr : Signal(1), in
+    latency : int
+        Sum of the phase spline, accumulator and Cordic latencies.
+    gain : float
+        Gain of the Cordic.
+    """
+    def __init__(self, width, t_width, a_width, p_width, f_width,
+                 a_order=4, p_order=1, f_order=2):
+        a = Spline(order=a_order, width=a_width)
+        p = Spline(order=p_order, width=p_width)
+        f = Spline(order=f_order, width=f_width)
+        self.submodules += a, p, f
+
+        self.a = a.tri(t_width)
+        self.f = f.tri(t_width)
+        self.p = p.tri(t_width)
+        self.i = [self.a, self.f, self.p]
+        self.o = [Signal((width, True)) for i in range(2)]
+        self.ce = Signal()
+        self.clr = Signal()
+        self.latency = 0  # will be accumulated
+
+        ###
+
+        self.latency += p.latency
+        q = Accu(f_width)
+        self.latency += q.latency
+        # amplitude delay of q.latency steps, advanced only when the
+        # accumulator accepts an input (see da.ce below)
+        da = CEInserter()(Delay)(width, q.latency)
+        c = Cordic(width=width, widthz=p_width,
+                   guard=None, eval_mode="pipelined")
+        self.latency += c.latency
+        self.gain = c.gain
+        self.submodules += q, da, c
+
+        self.sync += [
+            # the clear request is dropped once the phase spline output
+            # has been transferred ...
+            If(p.o.stb & p.o.ack,
+                q.i.clr.eq(0),
+            ),
+            # ... and sampled from `clr` whenever a new phase input is
+            # accepted (later statement, takes precedence)
+            If(p.i.stb & p.i.ack,
+                q.i.clr.eq(self.clr),
+            ),
+        ]
+        self.comb += [
+            da.ce.eq(q.i.stb & q.i.ack),
+            a.o.ack.eq(self.ce),
+            p.o.ack.eq(self.ce),
+            f.o.ack.eq(self.ce),
+            q.i.stb.eq(self.ce),
+            # eqh comes from .tools; NOTE(review): presumably it assigns
+            # with the most significant bits aligned -- confirm there
+            eqh(da.i, a.o.a0),
+            eqh(q.i.p, p.o.a0),
+            q.i.f.eq(f.o.a0),
+            q.o.ack.eq(1),
+            # rotate the vector (amplitude, 0) by the accumulated phase
+            c.xi.eq(da.o),
+            c.yi.eq(0),
+            eqh(c.zi, q.o.z),
+            eqh(self.o[0], c.xo),
+            eqh(self.o[1], c.yo),
+        ]
+
+
+class DDS(Module, SatAddMixin):
+    """Two summed :class:`DDSSlow` tones rotated by a parallel phase.
+
+    The outputs of two :class:`DDSSlow` instances are combined with
+    ``sat_add`` and used as the x/y inputs of `parallelism` pipelined
+    :class:`Cordic` instances. Their angles come from a
+    :class:`PhasedAccu` fed by first-order phase and frequency splines.
+
+    Parameters
+    ----------
+    width : int
+        Bit width of the output signals.
+    t_width : int or None
+        Passed to ``Spline.tri()``; defaults to `width`.
+    a_width, p_width, f_width : int or None
+        Spline widths. If `None` they are derived from `width`, `t_width`
+        and the spline order.
+    a_order, p_order, f_order : int
+        Spline orders handed to the :class:`DDSSlow` instances.
+    parallelism : int
+        Number of output pairs.
+
+    Attributes
+    ----------
+    f, p
+        Input endpoints of the frequency and phase splines of this module.
+    a0, f0, p0, a1, f1, p1
+        Input endpoints of the two :class:`DDSSlow` instances.
+    i : list
+        All input endpoints: ``f``, ``p`` and then the inputs of each
+        :class:`DDSSlow`.
+    o : list of [Signal, Signal]
+        Per output pair the Cordic ``xo`` and ``yo`` values, signed.
+    ce, clr : Signal(1), in
+    latency : int
+        Latency of the first :class:`DDSSlow` plus the Cordic latency.
+    gain : float
+        Product of the :class:`DDSSlow` gain and the Cordic gain.
+    """
+    def __init__(self, width, t_width=None,
+                 a_width=None, p_width=None, f_width=None,
+                 a_order=4, p_order=1, f_order=2, parallelism=8):
+        if t_width is None:
+            t_width = width
+        if a_width is None:
+            a_width = width + (a_order - 1)*t_width
+        if p_width is None:
+            p_width = width + (p_order - 1)*t_width
+        if f_width is None:
+            f_width = width + (f_order + 1)*t_width
+        self.b = [DDSSlow(width, t_width, a_width, p_width, f_width, a_order,
+                          p_order, f_order) for i in range(2)]
+        p = Spline(order=1, width=p_width)
+        f = Spline(order=1, width=f_width)
+        self.submodules += self.b, p, f
+
+        self.f = f.tri(t_width)
+        self.p = p.tri(t_width)
+        self.i = [self.f, self.p]
+        for i, bi in enumerate(self.b):
+            self.i += bi.i
+            # expose the inputs of DDSSlow number i as a<i>, f<i>, p<i>
+            for j in "afp":
+                setattr(self, "{}{}".format(j, i), getattr(bi, j))
+        self.o = [[Signal((width, True)) for i in range(2)]
+                  for i in range(parallelism)]
+        self.ce = Signal()
+        self.clr = Signal()
+        self.parallelism = parallelism
+        self.latency = 0  # will be accumulated
+
+        ###
+
+        self.latency += self.b[0].latency  # TODO: f0/p0, q.latency delta
+        q = PhasedAccu(f_width, parallelism)
+        self.submodules += q
+
+        self.sync += [
+            # the clear request is dropped once the phase spline output
+            # has been transferred ...
+            If(p.o.stb & p.o.ack,
+                q.i.clr.eq(0),
+            ),
+            # ... and sampled from `clr` whenever a new phase input is
+            # accepted (later statement, takes precedence)
+            If(p.i.stb & p.i.ack,
+                q.i.clr.eq(self.clr),
+            ),
+        ]
+        self.comb += [
+            [bi.ce.eq(self.ce) for bi in self.b],
+            [bi.clr.eq(self.clr) for bi in self.b],
+            p.o.ack.eq(self.ce),
+            f.o.ack.eq(self.ce),
+            q.i.stb.eq(self.ce),
+            # eqh comes from .tools; NOTE(review): presumably it assigns
+            # with the most significant bits aligned -- confirm there
+            eqh(q.i.p, p.o.a0),
+            eqh(q.i.f, f.o.a0),
+            q.o.ack.eq(1),
+        ]
+        # sat_add is provided by SatAddMixin (from .tools)
+        x = self.sat_add(bi.o[0] for bi in self.b)
+        y = self.sat_add(bi.o[1] for bi in self.b)
+
+        c = []
+        for i in range(parallelism):
+            ci = Cordic(width=width, widthz=p_width,
+                        guard=None, eval_mode="pipelined")
+            self.submodules += ci
+            c.append(ci)
+            qoi = getattr(q.o, "z{}".format(i))
+            # rotate the summed vector (x, y) by the accumulated phase
+            self.comb += [
+                ci.xi.eq(x),
+                ci.yi.eq(y),
+                eqh(ci.zi, qoi),
+                eqh(self.o[i][0], ci.xo),
+                eqh(self.o[i][1], ci.yo),
+            ]
+        self.latency += c[0].latency
+        self.gain = self.b[0].gain * c[0].gain
+
+
+class Config(Module):
+    """Configuration register and clock-enable generator.
+
+    Attributes
+    ----------
+    cfg : Record
+        Stored configuration with the fields ``tap`` (5 bits), ``clr``
+        (1 bit) and ``iq`` (2 bits).
+    i : Endpoint
+        Input endpoint with the layout of `cfg`; always acknowledged.
+    ce : Signal(1), out
+        Registered pulse, set for one cycle after the selected counter
+        bit changed.
+    """
+    def __init__(self):
+        self.cfg = Record([("tap", 5), ("clr", 1), ("iq", 2)])
+        self.i = Endpoint(self.cfg.layout)
+        self.ce = Signal()
+
+        ###
+
+        # free-running counter, one bit per possible value of `tap`
+        n = Signal(1 << len(self.i.tap))
+        # NOTE(review): `tap` is a local signal that is never assigned in
+        # this block, `cfg.tap` is not used here -- confirm this is
+        # intended
+        tap = Signal.like(self.i.tap)
+        clk = Signal()
+        clk0 = Signal()
+
+        self.comb += [
+            self.i.ack.eq(1),
+            # counter bit selected by `tap`
+            clk.eq(Array(n)[tap]),
+        ]
+        self.sync += [
+            # clk0 is clk delayed by one cycle; ce flags any change
+            clk0.eq(clk),
+            self.ce.eq(0),
+            If(clk0 ^ clk,
+                self.ce.eq(1),
+            ),
+            n.eq(n + 1),
+            # a new configuration restarts the counter (later statement,
+            # takes precedence over the increment)
+            If(self.i.stb,
+                n.eq(0),
+                self.cfg.eq(self.i.payload),
+            ),
+        ]
+
+
+class Channel(Module):
+    """Output channel combining a spline offset with a :class:`DDS`.
+
+    Parameters
+    ----------
+    width : int
+        Bit width of the output samples.
+    t_width : int or None
+        Passed to ``Spline.tri()`` and :class:`DDS`; defaults to `width`.
+    u_order : int
+        Order of the offset spline.
+    **kwargs
+        Forwarded to :class:`DDS`.
+
+    Attributes
+    ----------
+    i : list
+        Input endpoints: configuration, offset spline, then the DDS inputs.
+    q_i : list of Signal, in
+        Values optionally added to the outputs, see :meth:`connect_q`.
+    q_o : list of Signal, out
+        Second component of every DDS output pair.
+    o : list of Signal, out
+        Registered output samples, signed.
+    latency : int
+        DDS latency plus one for the output register.
+    cordic_gain : float
+        Gain reported by the DDS.
+    """
+    def __init__(self, width=16, t_width=None, u_order=4, **kwargs):
+        if t_width is None:
+            t_width = width
+        du = Spline(width=width + (u_order - 1)*t_width, order=u_order)
+        da = DDS(width, t_width, **kwargs)
+        cfg = Config()
+        self.submodules += du, da, cfg
+        self.i = [cfg.i, du.tri(t_width)] + da.i
+        self.q_i = [Signal((width, True)) for i in range(da.parallelism)]
+        self.q_o = [ai[1] for ai in da.o]
+        self.o = [Signal((width, True)) for i in range(da.parallelism)]
+        self.parallelism = da.parallelism
+        self.latency = da.latency + 1
+        self.cordic_gain = da.gain
+
+        ###
+
+        # delay du to match da
+        # ddu = Delay((width, True), da.latency - du.latency)
+        # self.submodules += ddu
+        self.comb += [
+        #     ddu.i.eq(du.o.a0[-width:]),
+            da.clr.eq(cfg.cfg.clr),
+            da.ce.eq(cfg.ce),
+            du.o.ack.eq(cfg.ce),
+        ]
+        # wire up outputs and q_{i,o} exchange
+        for oi, ai, qi in zip(self.o, da.o, self.q_i):
+            # sum of the top `width` bits of the offset spline, the first
+            # DDS component if iq[0] is set and the q input if iq[1] is set
+            self.sync += [
+                oi.eq(du.o.a0[-width:] + # ddu.o +
+                      Mux(cfg.cfg.iq[0], ai[0], 0) +
+                      Mux(cfg.cfg.iq[1], qi, 0)),
+            ]
+
+    def connect_q(self, buddy):
+        """Drive this channel's ``q_i`` from the ``q_o`` of `buddy`.
+
+        Parameters
+        ----------
+        buddy
+            Object whose ``q_o`` list has at least as many entries as
+            this channel's ``q_i``.
+        """
+        for i, qi in enumerate(self.q_i):
+            self.comb += qi.eq(buddy.q_o[i])
diff --git a/artiq/gateware/dsp/spline.py b/artiq/gateware/dsp/spline.py
@@ -0,0 +1,46 @@
+from migen import *
+from misoc.interconnect.stream import Endpoint
+
+
+class Spline(Module):
+    """Spline interpolator built from a chain of accumulators.
+
+    The endpoints ``i`` and ``o`` carry the signed coefficients
+    ``a0 .. a{order - 1}``. On every cycle in which the output register is
+    free or consumed, each coefficient except the highest is incremented
+    by the next higher one, shifted left by ``log2_int(step)``.
+
+    Parameters
+    ----------
+    order : int
+        Number of coefficients.
+    width : int
+        Bit width of each coefficient.
+    step : int
+        Step size; must be 1 unless ``order <= 2``.
+    time_width : int or None
+        Not used by this constructor.
+
+    Raises
+    ------
+    ValueError
+        If ``step != 1`` and ``order > 2``.
+
+    Attributes
+    ----------
+    latency : int
+        Set to 1.
+    """
+    def __init__(self, order, width, step=1, time_width=None):
+        if not (step == 1 or order <= 2):
+            raise ValueError("For non-linear splines, "
+                             "`step` needs to be one.")
+        layout = [("a{}".format(i), (width, True)) for i in range(order)]
+        self.i = Endpoint(layout)
+        self.o = Endpoint(layout)
+        self.latency = 1
+
+        ###
+
+        o = self.o.payload.flatten()
+
+        # ready for input whenever the output register is empty or is
+        # being consumed in this cycle
+        self.comb += self.i.ack.eq(~self.o.stb | self.o.ack)
+        self.sync += [
+            If(self.o.ack,
+                self.o.stb.eq(0),
+            ),
+            If(self.i.ack,
+                self.o.stb.eq(1),
+                [o[i].eq(o[i] + (o[i + 1] << log2_int(step)))
+                 for i in range(order - 1)],
+                # a new set of coefficients replaces the accumulated ones
+                # (later statement, takes precedence)
+                If(self.i.stb,
+                    self.o.payload.eq(self.i.payload),
+                ),
+            ),
+        ]
+
+    def tri(self, time_width):
+        """Create a narrower input endpoint driving the top bits of ``i``.
+
+        Coefficient ``a{k}`` of the returned endpoint is
+        ``(order - 1 - k)*time_width`` bits narrower than the matching
+        field of ``self.i`` and is assigned to its most significant bits.
+        ``stb`` and ``ack`` are connected straight through.
+
+        Parameters
+        ----------
+        time_width : int
+            Width reduction per coefficient order.
+
+        Returns
+        -------
+        Endpoint
+            The new input endpoint.
+        """
+        # walk the layout from the highest coefficient down so that the
+        # enumeration index is the number of `time_width` units to remove
+        layout = [(name, (length - i*time_width, signed))
+                  for i, (name, (length, signed), dir) in
+                  enumerate(self.i.payload.layout[::-1])]
+        layout.reverse()
+        i = Endpoint(layout)
+        self.comb += [
+            self.i.stb.eq(i.stb),
+            i.ack.eq(self.i.ack),
+            [i0[-len(i1):].eq(i1) for i0, i1 in
+             zip(self.i.payload.flatten(), i.payload.flatten())]
+        ]
+        return i
diff --git a/artiq/gateware/dsp/test_accu.py b/artiq/gateware/dsp/test_accu.py
@@ -0,0 +1,47 @@
+import numpy as np
+
+from migen import *
+from migen.fhdl.verilog import convert
+
+from accu import Accu, PhasedAccu
+
+from tools import xfer
+
+
+def read(o, n):
+    p = []
+    for i in range(n):
+        p.append((yield from [(yield pi) for pi in o.payload.flatten()]))
+        yield
+    return p
+
+
+def _test_gen_accu(dut, o):
+    yield dut.o.ack.eq(1)
+    yield from xfer(dut, i=dict(p=0, f=1, clr=1))
+    o.extend((yield from read(dut.o, 8)))
+    yield from xfer(dut, i=dict(p=0, f=2, clr=0))
+    o.extend((yield from read(dut.o, 8)))
+    yield from xfer(dut, i=dict(p=0, f=2, clr=1))
+    o.extend((yield from read(dut.o, 8)))
+    yield from xfer(dut, i=dict(p=8, f=-1, clr=1))
+    o.extend((yield from read(dut.o, 8)))
+    yield from xfer(dut, i=dict(p=0, f=0, clr=1))
+    yield from xfer(dut, i=dict(p=1, f=0, clr=0))
+    o.extend((yield from read(dut.o, 8)))
+
+
+def _test_accu():
+    dut = PhasedAccu(8, parallelism=8)
+
+    if False:
+        print(convert(dut))
+    else:
+        o = []
+        run_simulation(dut, _test_gen_accu(dut, o), vcd_name="accu.vcd")
+        o = np.array(o)
+        print(o)
+
+
+if __name__ == "__main__":
+    _test_accu()
diff --git a/artiq/gateware/dsp/test_phaser.py b/artiq/gateware/dsp/test_phaser.py
@@ -0,0 +1,41 @@
+import numpy as np
+
+from migen import *
+from migen.fhdl.verilog import convert
+
+from spline import Spline
+from phaser import Phaser
+
+
+def _test_gen_phaser(dut, o):
+    yield dut.ce.eq(1)
+    yield dut.i.a0.eq(0)
+    yield dut.i.a1.eq(1)
+    yield dut.i.stb.eq(1)
+    yield
+    while not (yield dut.i.ack):
+        yield
+    yield dut.i.stb.eq(0)
+    for i in range(len(dut.o)):
+        yield
+    for i in range(20):
+        yield
+        o.append((yield from [(yield pi.a0) for pi in dut.o]))
+
+
+def _test_phaser():
+    def f(step):
+        return Spline(order=2, width=16, step=step)
+    dut = Phaser(f, parallelism=2)
+
+    if False:
+        print(convert(dut))
+    else:
+        o = []
+        run_simulation(dut, _test_gen_phaser(dut, o), vcd_name="phaser.vcd")
+        o = np.array(o)
+        print(o)
+
+
+if __name__ == "__main__":
+    _test_phaser()
diff --git a/artiq/gateware/dsp/test_sawg.py b/artiq/gateware/dsp/test_sawg.py
@@ -0,0 +1,39 @@
+import numpy as np
+
+from migen import *
+from migen.fhdl.verilog import convert
+
+from sawg import DDS
+
+from tools import xfer
+
+
+def _test_gen_dds(dut, o):
+    yield dut.ce.eq(1)
+    yield dut.clr.eq(1)
+    yield from xfer(dut,
+                    a1=dict(a0=10),
+                    p1=dict(a0=0),
+                    f1=dict(a0=0 << 16, a1=0),
+                    f=dict(a0=10 << 24),
+                    p=dict(a0=0),
+                    )
+    for i in range(256):
+        yield
+        o.append((yield from [((yield _[0]), (yield _[1])) for _ in dut.o]))
+
+
+def _test_channel():
+    dut = DDS(width=8, parallelism=2)
+
+    if False:
+        print(convert(dut))
+    else:
+        o = []
+        run_simulation(dut, _test_gen_dds(dut, o), vcd_name="dds.vcd")
+        o = np.array(o)
+        print(o[:, :, 0])
+
+
+if __name__ == "__main__":
+    _test_channel()
diff --git a/artiq/gateware/dsp/test_spline.py b/artiq/gateware/dsp/test_spline.py
@@ -0,0 +1,31 @@
+import numpy as np
+
+from migen import *
+from migen.fhdl.verilog import convert
+
+from spline import Spline
+from tools import xfer
+
+
+def _test_gen_spline(dut, o):
+    """Simulation stimulus for the spline test.
+
+    Holds ``dut.o.ack`` high, transfers one coefficient set into ``dut.i``
+    and appends the value of ``dut.o.a0`` to ``o`` on each of the next 20
+    cycles.
+    """
+    yield dut.o.ack.eq(1)
+    coefficients = dict(a0=0, a1=1, a2=2)
+    yield from xfer(dut, i=coefficients)
+    for _ in range(20):
+        yield
+        value = yield dut.o.a0
+        o.append(value)
+
+
+def _test_spline():
+    """Build a ``Spline`` (``order=3``, ``width=16``, ``step=1``) and simulate it.
+
+    The simulation writes ``spline.vcd`` and prints the recorded samples
+    as a numpy array. Flip ``emit_verilog`` by hand to print the converted
+    design instead of simulating.
+    """
+    dut = Spline(order=3, width=16, step=1)
+
+    emit_verilog = False
+    if emit_verilog:
+        print(convert(dut))
+        return
+
+    samples = []
+    run_simulation(dut, _test_gen_spline(dut, samples),
+                   vcd_name="spline.vcd")
+    print(np.array(samples))
+
+
+# Run the spline simulation when this file is executed as a script.
+if __name__ == "__main__":
+    _test_spline()
diff --git a/artiq/gateware/dsp/tools.py b/artiq/gateware/dsp/tools.py
@@ -0,0 +1,77 @@
+from operator import add
+from functools import reduce
+
+from migen import *
+
+
+def set_dict(e, **k):
+    """Yield ``.eq()`` assignments for attributes of ``e``.
+
+    Each keyword names an attribute of ``e``. A ``dict`` value recurses
+    into that attribute; any other value is assigned to it with ``.eq()``.
+    """
+    for name, value in k.items():
+        target = getattr(e, name)
+        if isinstance(value, dict):
+            yield from set_dict(target, **value)
+        else:
+            yield target.eq(value)
+
+
+def xfer(dut, **kw):
+    """Simulation helper: push one value set into several endpoints.
+
+    Each keyword names an endpoint attribute of ``dut`` and maps it to
+    the field values to load (see ``set_dict``). All endpoints are then
+    strobed together. An endpoint's ``stb`` is released once its ``ack``
+    reads true, or on the first cycle if it has no ``ack`` attribute.
+
+    :raises ValueError: if an endpoint has a ``busy`` attribute that
+        reads true while the transfer is still pending.
+    """
+    pending = []
+    for name, fields in kw.items():
+        endpoint = getattr(dut, name)
+        yield from set_dict(endpoint, **fields)
+        pending.append(endpoint)
+    for endpoint in pending:
+        yield endpoint.stb.eq(1)
+    while pending:
+        yield
+        # Iterate over a snapshot, as finished endpoints are removed.
+        for endpoint in list(pending):
+            if hasattr(endpoint, "busy") and (yield endpoint.busy):
+                raise ValueError(endpoint, "busy")
+            if hasattr(endpoint, "ack") and not (yield endpoint.ack):
+                continue
+            yield endpoint.stb.eq(0)
+            pending.remove(endpoint)
+
+
+class Delay(Module):
+    """Delay line made of ``delay`` chained synchronous stages.
+
+    ``i`` selects what is delayed:
+
+    * an ``int`` or ``tuple`` is passed to ``Signal(...)``,
+    * a ``list`` is passed to ``Record(...)``,
+    * a ``Record`` contributes its ``layout``,
+    * anything else is mirrored with ``Signal.like``.
+
+    In the last two cases ``i`` itself is also driven combinatorially
+    into the first stage. ``self.i`` and ``self.o`` are the first and the
+    last stage; if ``o`` is given it is assigned from ``self.o``.
+    ``self.latency`` is set to ``delay``.
+    """
+    def __init__(self, i, delay, o=None):
+        # delay + 1 stages: z[0] is the input, z[-1] the output.
+        if isinstance(i, (int, tuple)):
+            z = [Signal(i) for j in range(delay + 1)]
+        elif isinstance(i, list):
+            z = [Record(i) for j in range(delay + 1)]
+        elif isinstance(i, Record):
+            z = [Record(i.layout) for j in range(delay + 1)]
+        else:
+            z = [Signal.like(i) for j in range(delay + 1)]
+        self.i = z[0]
+        self.o = z[-1]
+        # Only a real signal/record can drive the input; for a bare
+        # width or layout the caller has to drive self.i.
+        if not isinstance(i, (int, list, tuple)):
+            self.comb += self.i.eq(i)
+        if o is not None:
+            self.comb += o.eq(self.o)
+        self.latency = delay
+        # Each stage registers its predecessor.
+        self.sync += [z[j + 1].eq(z[j]) for j in range(delay)]
+
+
+def eqh(a, b):
+    """Return ``a[-len(b):].eq(b[-len(a):])``.
+
+    Both operands are sliced from their upper end by the other's length
+    before ``b`` is assigned to ``a``.
+    """
+    destination = a[-len(b):]
+    source = b[-len(a):]
+    return destination.eq(source)
+
+
+class SatAddMixin:
+    """Mixin adding a signed saturating adder to a migen ``Module``."""
+
+    def sat_add(self, a):
+        """Sum the signals in ``a`` and saturate the result.
+
+        The sum is formed at full precision in a signed signal that is
+        ``log2(len(a))`` bits (rounded up) wider than the widest operand.
+        For a single operand that signal is returned directly. Otherwise
+        the result is a signed signal as wide as the widest operand,
+        clamped to its most positive or most negative value on overflow.
+
+        :param a: iterable of signals to add; any number of operands
+            is accepted.
+        :returns: the signal carrying the (saturated) sum.
+        """
+        a = list(a)
+        # assert all(len(a[0]) == len(ai) for ai in a)
+        # assert all(a[0].signed == ai.signed for ai in a)
+        n = max(len(ai) for ai in a)
+        # Head-room bits for the full-precision sum. need_pow2=False
+        # rounds up instead of raising ValueError when the operand count
+        # is not a power of two.
+        o = log2_int(len(a), need_pow2=False)
+        s = Signal((n + o, True))
+        self.comb += s.eq(reduce(add, a))
+        if len(a) == 1:
+            return s
+        else:
+            s0 = Signal((n, True))
+            self.comb += [
+                # No overflow when all head-room bits match the sign bit
+                # of the n-bit result.
+                If(s[-o:] == Replicate(s[-o-1], o),
+                    s0.eq(s[:n]),
+                ).Else(
+                    # Saturate: sign bit from the full sum, all lower
+                    # bits set to its inverse.
+                    s0.eq(Cat(Replicate(~s[-1], n - 1), s[-1])),
+                )
+            ]
+            return s0
diff --git a/artiq/gateware/rtio/phy/sawg.py b/artiq/gateware/rtio/phy/sawg.py
@@ -0,0 +1,29 @@
+from collections import namedtuple
+
+from migen import *
+from artiq.gateware.rtio import rtlink
+
+from artiq.gateware.dsp import sawg
+
+
+# Per-PHY record collected in Channel.phys: the rtlink interface plus its
+# probe and override lists.
+_Phy = namedtuple("Phy", "rtlink probes overrides")
+
+
+class Channel(Module):
+    """RTIO wrapper around ``sawg.Channel``.
+
+    All arguments are forwarded to ``sawg.Channel``, which is instantiated
+    through ``ClockDomainsRenamer("rio_phy")`` and kept as ``self._ll``.
+    For every input endpoint in ``self._ll.i`` an output-only rtlink
+    interface is created and recorded in ``self.phys`` as a ``_Phy``.
+    """
+    def __init__(self, *args, **kwargs):
+        self.submodules._ll = ClockDomainsRenamer("rio_phy")(
+            sawg.Channel(*args, **kwargs))
+        self.phys = []
+        for i in self._ll.i:
+            # The rtlink data width is capped at 64 bits.
+            # NOTE(review): payloads wider than that would presumably
+            # only be partially driven - confirm against rtlink.
+            rl = rtlink.Interface(rtlink.OInterface(
+                min(64, len(i.payload))))
+            self.comb += [
+                i.stb.eq(rl.o.stb),
+                # Report busy whenever the endpoint does not acknowledge.
+                rl.o.busy.eq(~i.ack),
+                Cat(i.payload.flatten()).eq(rl.o.data),
+            ]
+            # TODO: probes, overrides
+            self.phys.append(_Phy(rl, [], []))
+
+    def connect_q(self, other):
+        """Forward to ``connect_q`` of the wrapped channel, passing the
+        channel wrapped by ``other``."""
+        return self._ll.connect_q(other._ll)
diff --git a/artiq/gateware/targets/kc705.py b/artiq/gateware/targets/kc705.py
@@ -19,7 +19,8 @@
 
 from artiq.gateware.soc import AMPSoC, build_artiq_soc
 from artiq.gateware import rtio, nist_qc1, nist_clock, nist_qc2
-from artiq.gateware.rtio.phy import ttl_simple, ttl_serdes_7series, dds, spi
+from artiq.gateware.rtio.phy import (ttl_simple, ttl_serdes_7series,
+                                     dds, spi, sawg)
 from artiq import __version__ as artiq_version
 
 
@@ -388,6 +389,65 @@ def __init__(self, cpu_type="or1k", **kwargs):
         self.config["DDS_RTIO_CLK_RATIO"] = 24 >> self.rtio.fine_ts_width
 
 
+class Phaser(_NIST_Ions):
+    """KC705 target with four SAWG channels on RTIO.
+
+    RTIO channel order: one serdes TTL in/out on ``user_sma_gpio_n_33``,
+    one TTL output on ``user_led`` 2, then one channel per PHY of each
+    SAWG channel, then the log channel.
+    """
+    def __init__(self, cpu_type="or1k", **kwargs):
+        _NIST_Ions.__init__(self, cpu_type, **kwargs)
+
+        platform = self.platform
+        # TODO: dummy
+        platform.add_extension(nist_clock.fmc_adapter_io)
+
+        rtio_channels = []
+
+        phy = ttl_serdes_7series.Inout_8X(
+            platform.request("user_sma_gpio_n_33"))
+        self.submodules += phy
+        rtio_channels.append(rtio.Channel.from_phy(phy, ififo_depth=128))
+
+        phy = ttl_simple.Output(platform.request("user_led", 2))
+        self.submodules += phy
+        rtio_channels.append(rtio.Channel.from_phy(phy))
+
+        self.config["RTIO_REGULAR_TTL_COUNT"] = len(rtio_channels)
+
+        # The SAWG PHY channels are appended right after the TTLs.
+        self.config["RTIO_FIRST_PHASER_CHANNEL"] = len(rtio_channels)
+
+        sawgs = [sawg.Channel(width=16, parallelism=4,
+                                   a_order=1, p_order=1, f_order=1)
+                 for i in range(4)]
+        self.submodules += sawgs
+        # Cross-connect the channels pairwise (0 with 1, 2 with 3).
+        for i in range(0, len(sawgs), 2):
+            sawgs[i].connect_q(sawgs[i + 1])
+            sawgs[i + 1].connect_q(sawgs[i])
+
+        # TODO: dummy
+        # Chain every SAWG output through a registered adder and drive
+        # the final value onto the "dds" data pins.
+        # NOTE(review): the adder chain uses the default sync domain
+        # while the final register uses rio_phy - confirm this is
+        # intended.
+        o = Signal((16, True))
+        for ch in sawgs:
+            for oi in ch._ll.o:
+                o0, o = o, Signal.like(o)
+                self.sync += o.eq(o0 + oi)
+        self.sync.rio_phy += platform.request("dds").d.eq(o)
+
+        # TODO: support wider RTIO (data) channels
+        # (64 bit is fine here for testing)
+        # The loop variable "sawg" shadows the imported sawg module, but
+        # only inside this generator expression.
+        rtio_channels.extend(rtio.Channel.from_phy(phy)
+                             for sawg in sawgs
+                             for phy in sawg.phys)
+
+        self.config["RTIO_LOG_CHANNEL"] = len(rtio_channels)
+        rtio_channels.append(rtio.LogChannel())
+        self.add_rtio(rtio_channels)
+
+        # No DDS channels are instantiated here (RTIO_DDS_COUNT is 0).
+        # NOTE(review): the remaining DDS_* entries are presumably still
+        # required by the build - confirm.
+        self.config["RTIO_FIRST_DDS_CHANNEL"] = len(rtio_channels)
+        self.config["RTIO_DDS_COUNT"] = 0
+        self.config["DDS_CHANNELS_PER_BUS"] = 1
+        self.config["DDS_AD9914"] = True
+        self.config["DDS_ONEHOT_SEL"] = True
+        self.config["DDS_RTIO_CLK_RATIO"] = 24 >> self.rtio.fine_ts_width
+        assert self.rtio.fine_ts_width <= 3
+
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="ARTIQ core device builder / KC705 "
@@ -407,6 +467,8 @@ def main():
         cls = NIST_CLOCK
     elif hw_adapter == "nist_qc2":
         cls = NIST_QC2
+    elif hw_adapter == "phaser":
+        cls = Phaser
     else:
         raise SystemExit("Invalid hardware adapter string (-H/--hw-adapter)")