nics are starting to offer the ability to timestamp packets when
they're received. other systems (eg linux and freebsd) have support
for recording that timestamp on mbufs and then using it as the
backend for at least the SO_TIMESTAMP socket option instead of a
call to microtime().

this implements the above, and additionally supports using the hw
timestamp in bpf too. other systems may do the bpf thing too, but i
didn't look closely enough to find out.

timestamps are recorded as the uptime of the system in nanoseconds
in the ph_timestamp field in mbufs. this mirrors the use of
ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
i'm using another bit in m_pkthdr.csum_flags to say whether the timestamp
is valid or not (M_TIMESTAMP). i'm arguing that it's another offloading
feature and therefore appropriate for the csum offload flags field.

this adds some inline functions to time.h for turning ns into a timeval
and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
i originally wanted ns_to_timeval and ns_to_timespec, but the linux
compat stuff in drm already uses those names and ruined the idea,
especially since those versions return the time{val,spec}s as values.

the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
and if so turns ph_timestamp into a timeval, adds it to boottime
(which is the wall clock time that uptime starts at), and uses the
result instead of calling microtime().

the mcx changes are based on what freebsd did to their driver, but
simplified a bit.

i want this because we're being asked to look at recording network
traffic for possible audit use. part of that is having accurate
timestamps on received packets, and hopefully it will mitigate
chunks of packets getting reordered or delayed significantly when the
box is busy.

thoughts? ok?

Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.242
diff -u -p -r1.242 mbuf.h
--- sys/mbuf.h  11 Feb 2019 00:25:33 -0000      1.242
+++ sys/mbuf.h  7 Jun 2019 03:27:41 -0000
@@ -226,13 +226,14 @@ struct mbuf {
 #define        M_ICMP_CSUM_IN_OK       0x0400  /* ICMP/ICMPv6 checksum verified */
 #define        M_ICMP_CSUM_IN_BAD      0x0800  /* ICMP/ICMPv6 checksum bad */
 #define        M_IPV6_DF_OUT           0x1000  /* don't fragment outgoing IPv6 */
+#define        M_TIMESTAMP             0x2000  /* ph_timestamp is set */
 
 #ifdef _KERNEL
 #define MCS_BITS \
     ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \
     "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \
     "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \
-    "\15IPV6_NODF_OUT")
+    "\15IPV6_NODF_OUT" "\16TIMESTAMP")
 #endif
 
 /* mbuf types */
Index: sys/time.h
===================================================================
RCS file: /cvs/src/sys/sys/time.h,v
retrieving revision 1.41
diff -u -p -r1.41 time.h
--- sys/time.h  3 Jun 2019 01:27:30 -0000       1.41
+++ sys/time.h  7 Jun 2019 03:27:41 -0000
@@ -333,6 +333,20 @@ void clock_secs_to_ymdhms(time_t, struct
 /* Traditional POSIX base year */
 #define POSIX_BASE_YEAR 1970
 
+static __inline void
+ns_to_microtime(struct timeval *tv, uint64_t ns)
+{
+       tv->tv_sec = ns / 1000000000L;
+       tv->tv_usec = (ns % 1000000000L) / 1000;
+}
+
+static __inline void
+ns_to_nanotime(struct timespec *tv, uint64_t ns)
+{
+       tv->tv_sec = ns / 1000000000L;
+       tv->tv_nsec = ns % 1000000000L;
+}
+
 #else /* !_KERNEL */
 #include <time.h>
 
Index: net/bpf.c
===================================================================
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.175
diff -u -p -r1.175 bpf.c
--- net/bpf.c   18 May 2019 12:59:32 -0000      1.175
+++ net/bpf.c   7 Jun 2019 03:27:41 -0000
@@ -1284,13 +1284,25 @@ _bpf_mtap(caddr_t arg, const struct mbuf
                        fcode = bps->bps_bf.bf_insns;
                slen = bpf_mfilter(fcode, m, pktlen);
 
-               if (slen == 0)
+               if (slen == 0)
                        continue;
                if (d->bd_fildrop != BPF_FILDROP_PASS)
                        drop = 1;
                if (d->bd_fildrop != BPF_FILDROP_DROP) {
-                       if (!gottime++)
-                               microtime(&tv);
+                       if (!gottime) {
+                               if (ISSET(m->m_flags, M_PKTHDR) &&
+                                   ISSET(m->m_pkthdr.csum_flags,
+                                    M_TIMESTAMP)) {
+                                       struct timeval btv;
+                                       ns_to_microtime(&tv,
+                                           m->m_pkthdr.ph_timestamp);
+                                       microboottime(&btv);
+                                       timeradd(&tv, &btv, &tv);
+                               } else
+                                       microtime(&tv);
+
+                               gottime = 1;
+                       }
 
                        mtx_enter(&d->bd_mtx);
                        bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
Index: netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.342
diff -u -p -r1.342 ip_input.c
--- netinet/ip_input.c  13 Oct 2018 18:36:01 -0000      1.342
+++ netinet/ip_input.c  7 Jun 2019 03:27:41 -0000
@@ -1712,7 +1712,14 @@ ip_savecontrol(struct inpcb *inp, struct
        if (inp->inp_socket->so_options & SO_TIMESTAMP) {
                struct timeval tv;
 
-               microtime(&tv);
+               if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP)) {
+                       struct timeval btv;
+                       ns_to_microtime(&tv, m->m_pkthdr.ph_timestamp);
+                       microboottime(&btv);
+                       timeradd(&tv, &btv, &tv);
+               } else
+                       microtime(&tv);
+
                *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
                    SCM_TIMESTAMP, SOL_SOCKET);
                if (*mp)
Index: dev/pci/if_mcx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.16
diff -u -p -r1.16 if_mcx.c
--- dev/pci/if_mcx.c    4 Jun 2019 05:29:30 -0000       1.16
+++ dev/pci/if_mcx.c    7 Jun 2019 03:27:41 -0000
@@ -1243,20 +1245,22 @@ struct mcx_cmd_destroy_cq_out {
 } __packed __aligned(4);
 
 struct mcx_cq_entry {
-       uint32_t                cq_reserved1;
+       uint32_t                __reserved__;
        uint32_t                cq_lro;
        uint32_t                cq_lro_ack_seq_num;
        uint32_t                cq_rx_hash;
-       uint32_t                cq_rx_hash_type;
+       uint8_t                 cq_rx_hash_type;
+       uint8_t                 cq_ml_path;
+       uint16_t                __reserved__;
        uint32_t                cq_checksum;
-       uint32_t                cq_reserved2;
+       uint32_t                __reserved__;
        uint32_t                cq_flags;
        uint32_t                cq_lro_srqn;
-       uint32_t                cq_reserved3[2];
+       uint32_t                __reserved__[2];
        uint32_t                cq_byte_cnt;
-       uint32_t                cq_lro_ts_value;
-       uint32_t                cq_lro_ts_echo;
-       uint32_t                cq_flow_tag;
+       uint64_t                cq_timestamp;
+#define MCX_CQ_ENTRY_TIMESTAMP_PTP     (1 << 63)
+       uint32_t                cq_rx_drops;
        uint16_t                cq_wqe_count;
        uint8_t                 cq_signature;
        uint8_t                 cq_opcode_owner;
@@ -1895,6 +1899,18 @@ struct mcx_cq {
        int                      cq_count;
 };
 
+struct mcx_calibration {
+       uint64_t                 c_timestamp;   /* previous mcx chip time */
+       uint64_t                 c_uptime;      /* previous kernel nanouptime */
+       uint64_t                 c_tbase;       /* mcx chip time */
+       uint64_t                 c_ubase;       /* kernel nanouptime */
+       uint64_t                 c_tdiff;
+       uint64_t                 c_udiff;
+};
+
+#define MCX_CALIBRATE_FIRST    2
+#define MCX_CALIBRATE_NORMAL   30
+
 struct mcx_softc {
        struct device            sc_dev;
        struct arpcom            sc_ac;
@@ -1948,6 +1964,11 @@ struct mcx_softc {
        int                      sc_extra_mcast;
        uint8_t                  sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN];
 
+       struct mcx_calibration   sc_calibration[2];
+       unsigned int             sc_calibration_gen;
+       uint32_t                 sc_freq;
+       struct timeout           sc_calibrate;
+
        struct mcx_cq            sc_cq[MCX_MAX_CQS];
        int                      sc_num_cq;
 
@@ -2039,8 +2060,8 @@ static void       mcx_cmdq_dump(const struct m
 static void    mcx_cmdq_mbox_dump(struct mcx_dmamem *, int);
 */
 static void    mcx_refill(void *);
-static void    mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
-                   struct mbuf_list *, int *);
+static int     mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
+                   struct mbuf_list *, const struct mcx_calibration *);
 static void    mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *,
                    int *);
 static void    mcx_process_cq(struct mcx_softc *, struct mcx_cq *);
@@ -2060,6 +2081,9 @@ static void       mcx_media_status(struct ifne
 static int     mcx_media_change(struct ifnet *);
 static int     mcx_get_sffpage(struct ifnet *, struct if_sffpage *);
 
+static void    mcx_calibrate_first(struct mcx_softc *);
+static void    mcx_calibrate(void *);
+
 static inline uint32_t
                mcx_rd(struct mcx_softc *, bus_size_t);
 static inline void
@@ -2067,6 +2091,8 @@ static inline void
 static inline void
                mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int);
 
+static uint64_t        mcx_timer(struct mcx_softc *);
+
 static int     mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *,
                    bus_size_t, u_int align);
 static void    mcx_dmamem_zero(struct mcx_dmamem *);
@@ -2338,6 +2364,7 @@ mcx_attach(struct device *parent, struct
        ether_ifattach(ifp);
 
        timeout_set(&sc->sc_rx_refill, mcx_refill, sc);
+       timeout_set(&sc->sc_calibrate, mcx_calibrate, sc);
 
        sc->sc_flow_table_id = -1;
        for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) {
@@ -5557,9 +5590,65 @@ mcx_process_txeof(struct mcx_softc *sc, 
        ms->ms_m = NULL;
 }
 
-void
+static uint64_t
+mcx_uptime(void)
+{
+       struct timespec ts;
+
+       nanouptime(&ts);
+
+       return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec);
+}
+
+static void
+mcx_calibrate_first(struct mcx_softc *sc)
+{
+       struct mcx_calibration *c = &sc->sc_calibration[0];
+
+       sc->sc_calibration_gen = 0;
+
+       c->c_ubase = mcx_uptime();
+       c->c_tbase = mcx_timer(sc);
+       c->c_tdiff = 0;
+
+       timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST);
+}
+
+#define MCX_TIMESTAMP_SHIFT 10
+
+static void
+mcx_calibrate(void *arg)
+{
+       struct mcx_softc *sc = arg;
+       struct mcx_calibration *nc, *pc;
+       unsigned int gen;
+
+       if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
+               return;
+
+       timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL);
+
+       gen = sc->sc_calibration_gen;
+       pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+       gen++;
+       nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+
+       nc->c_uptime = pc->c_ubase;
+       nc->c_timestamp = pc->c_tbase;
+
+       nc->c_ubase = mcx_uptime();
+       nc->c_tbase = mcx_timer(sc);
+
+       nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT;
+       nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT;
+
+       membar_producer();
+       sc->sc_calibration_gen = gen;
+}
+
+static int
 mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe,
-    struct mbuf_list *ml, int *slots)
+    struct mbuf_list *ml, const struct mcx_calibration *c)
 {
        struct mcx_slot *ms;
        struct mbuf *m;
@@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str
 
        m = ms->ms_m;
        ms->ms_m = NULL;
-       m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt);
-       (*slots)++;
+
+       m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
+
+       if (cqe->cq_rx_hash_type) {
+               m->m_pkthdr.ph_flowid = M_FLOWID_VALID |
+                   bemtoh32(&cqe->cq_rx_hash);
+       }
+
+       if (c->c_tdiff) {
+               uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
+               t *= c->c_udiff;
+               t /= c->c_tdiff;
+
+               m->m_pkthdr.ph_timestamp = c->c_uptime + t;
+               SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
+       }
 
        ml_enqueue(ml, m);
+
+       return (1);
 }
 
 static struct mcx_cq_entry *
@@ -5624,11 +5729,17 @@ void
 mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
 {
        struct ifnet *ifp = &sc->sc_ac.ac_if;
+       const struct mcx_calibration *c;
+       unsigned int gen;
        struct mcx_cq_entry *cqe;
        uint8_t *cqp;
        struct mbuf_list ml = MBUF_LIST_INITIALIZER();
        int rxfree, txfree;
 
+       gen = sc->sc_calibration_gen;
+       membar_consumer();
+       c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
+
        rxfree = 0;
        txfree = 0;
        while ((cqe = mcx_next_cq_entry(sc, cq))) {
@@ -5639,7 +5750,7 @@ mcx_process_cq(struct mcx_softc *sc, str
                        mcx_process_txeof(sc, cqe, &txfree);
                        break;
                case MCX_CQ_ENTRY_OPCODE_SEND:
-                       mcx_process_rx(sc, cqe, &ml, &rxfree);
+                       rxfree += mcx_process_rx(sc, cqe, &ml, c);
                        break;
                case MCX_CQ_ENTRY_OPCODE_REQ_ERR:
                case MCX_CQ_ENTRY_OPCODE_SEND_ERR:
@@ -5882,6 +5993,8 @@ mcx_up(struct mcx_softc *sc)
        sc->sc_rx_prod = 0;
        mcx_rx_fill(sc);
 
+       mcx_calibrate_first(sc);
+
        SET(ifp->if_flags, IFF_RUNNING);
 
        sc->sc_tx_cons = 0;
@@ -5922,6 +6035,8 @@ mcx_down(struct mcx_softc *sc)
        intr_barrier(&sc->sc_ih);
        ifq_barrier(&ifp->if_snd);
 
+       timeout_del_barrier(&sc->sc_calibrate);
+
        for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) {
                if (sc->sc_flow_group_id[group] != -1)
                        mcx_destroy_flow_group(sc,
@@ -6435,6 +6550,26 @@ static inline void
 mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f)
 {
        bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f);
+}
+
+static uint64_t
+mcx_timer(struct mcx_softc *sc)
+{
+       uint32_t hi, lo, ni;
+
+       hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
+       for (;;) {
+               lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
+               mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
+               ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
+
+               if (ni == hi)
+                       break;
+
+               hi = ni;
+       }
+
+       return (((uint64_t)hi << 32) | (uint64_t)lo);
 }
 
 static int

Reply via email to