nics are starting to offer the ability to timestamp packets when they're received. other systems (eg linux and freebsd) have support for recording that timestamp on mbufs and then using it as the backend for at least the SO_TIMESTAMP socket option instead of a call to microtime().
this implements the above, and additionally supports using the hw timestamp in bpf too. other systems may do the bpf thing too, but i didn't look closely enough to find out. timestamps are recorded as the uptime of the system in nanoseconds in the ph_timestamp field in mbufs. this mirrors the use of ph_timestamp in the fq_codel code to store the uptime in nanoseconds. im using another bit in m_pkthdr.csum_flags to say whether the timestamp is valid or not (M_TIMESTAMP). im arguing that it's another offloading feature and therefore appropriate for the csum offload flags field. this adds some inline functions to time.h for turning ns into a timeval and timespec, which are ns_to_microtime and ns_to_nanotime respectively. i originally wanted ns_to_timeval and ns_to_timespec, but the linux compat stuff in drm already uses those names and ruined the idea. especially since they return the time{val,spec}s as values. the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set, and if so turns ph_timestamp into a timeval before adding it to boottime (which is the wall clock time that uptime starts at), before using it instead of microtime(). the mcx changes are based on what freebsd did to their driver, but simplified a bit. i want this because we're being asked to look at recording network traffic for possible audit use. part of that is having accurate timestamps on received packets, and hopefully it will mitigate against chunks of packets getting reordered or delayed significantly when the box is busy. thoughts? ok? Index: sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v retrieving revision 1.242 diff -u -p -r1.242 mbuf.h --- sys/mbuf.h 11 Feb 2019 00:25:33 -0000 1.242 +++ sys/mbuf.h 7 Jun 2019 03:27:41 -0000 @@ -226,13 +226,14 @@ struct mbuf { #define M_ICMP_CSUM_IN_OK 0x0400 /* ICMP/ICMPv6 checksum verified */ #define M_ICMP_CSUM_IN_BAD 0x0800 /* ICMP/ICMPv6 checksum bad */ #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 */ +#define M_TIMESTAMP 0x2000 /* ph_timestamp is set */ #ifdef _KERNEL #define MCS_BITS \ ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \ "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \ "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \ - "\15IPV6_NODF_OUT") + "\15IPV6_NODF_OUT" "\16TIMESTAMP") #endif /* mbuf types */ Index: sys/time.h =================================================================== RCS file: /cvs/src/sys/sys/time.h,v retrieving revision 1.41 diff -u -p -r1.41 time.h --- sys/time.h 3 Jun 2019 01:27:30 -0000 1.41 +++ sys/time.h 7 Jun 2019 03:27:41 -0000 @@ -333,6 +333,20 @@ void clock_secs_to_ymdhms(time_t, struct /* Traditional POSIX base year */ #define POSIX_BASE_YEAR 1970 +static __inline void +ns_to_microtime(struct timeval *tv, uint64_t ns) +{ + tv->tv_sec = ns / 1000000000L; + tv->tv_usec = (ns % 1000000000L) / 1000; +} + +static __inline void +ns_to_nanotime(struct timespec *tv, uint64_t ns) +{ + tv->tv_sec = ns / 1000000000L; + tv->tv_nsec = ns % 1000000000L; +} + #else /* !_KERNEL */ #include <time.h> Index: net/bpf.c =================================================================== RCS file: /cvs/src/sys/net/bpf.c,v retrieving revision 1.175 diff -u -p -r1.175 bpf.c --- net/bpf.c 18 May 2019 12:59:32 -0000 1.175 +++ net/bpf.c 7 Jun 2019 03:27:41 -0000 @@ -1284,13 +1284,25 @@ _bpf_mtap(caddr_t arg, const struct mbuf fcode = bps->bps_bf.bf_insns; slen = bpf_mfilter(fcode, m, pktlen); - if (slen == 0) + if (slen == 0) continue; if (d->bd_fildrop != BPF_FILDROP_PASS) drop = 1; if (d->bd_fildrop != BPF_FILDROP_DROP) { - if (!gottime++) - microtime(&tv); + if (!gottime) { + if (ISSET(m->m_flags, M_PKTHDR) && + ISSET(m->m_pkthdr.csum_flags, + M_TIMESTAMP)) { + struct timeval btv; + ns_to_microtime(&tv, + m->m_pkthdr.ph_timestamp); + microboottime(&btv); + timeradd(&tv, &btv, &tv); + } else + microtime(&tv); + + gottime = 1; + } mtx_enter(&d->bd_mtx); bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn, Index: netinet/ip_input.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_input.c,v retrieving revision 1.342 diff -u -p -r1.342 ip_input.c --- netinet/ip_input.c 13 Oct 2018 18:36:01 -0000 1.342 +++ netinet/ip_input.c 7 Jun 2019 03:27:41 -0000 @@ -1712,7 +1712,14 @@ ip_savecontrol(struct inpcb *inp, struct if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; - microtime(&tv); + if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP)) { + struct timeval btv; + ns_to_microtime(&tv, m->m_pkthdr.ph_timestamp); + microboottime(&btv); + timeradd(&tv, &btv, &tv); + } else + microtime(&tv); + *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) Index: dev/pci/if_mcx.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v retrieving revision 1.16 diff -u -p -r1.16 if_mcx.c --- dev/pci/if_mcx.c 4 Jun 2019 05:29:30 -0000 1.16 +++ dev/pci/if_mcx.c 7 Jun 2019 03:27:41 -0000 @@ -1243,20 +1245,22 @@ struct mcx_cmd_destroy_cq_out { } __packed __aligned(4); struct mcx_cq_entry { - uint32_t cq_reserved1; + uint32_t __reserved__; uint32_t cq_lro; uint32_t cq_lro_ack_seq_num; uint32_t cq_rx_hash; - uint32_t cq_rx_hash_type; + uint8_t cq_rx_hash_type; + uint8_t cq_ml_path; + uint16_t __reserved__; uint32_t cq_checksum; - uint32_t cq_reserved2; + uint32_t __reserved__; uint32_t cq_flags; uint32_t cq_lro_srqn; - uint32_t cq_reserved3[2]; + uint32_t __reserved__[2]; uint32_t cq_byte_cnt; - uint32_t cq_lro_ts_value; - uint32_t cq_lro_ts_echo; - uint32_t cq_flow_tag; + uint64_t cq_timestamp; +#define MCX_CQ_ENTRY_TIMESTAMP_PTP (1 << 63) + uint32_t cq_rx_drops; uint16_t cq_wqe_count; uint8_t cq_signature; uint8_t cq_opcode_owner; @@ -1895,6 +1899,18 @@ struct mcx_cq { int cq_count; }; +struct mcx_calibration { + uint64_t c_timestamp; /* previous mcx chip time */ + uint64_t c_uptime; /* previous kernel nanouptime */ + uint64_t c_tbase; /* mcx chip time */ + uint64_t c_ubase; /* kernel nanouptime */ + uint64_t c_tdiff; + uint64_t c_udiff; +}; + +#define MCX_CALIBRATE_FIRST 2 +#define MCX_CALIBRATE_NORMAL 30 + struct mcx_softc { struct device sc_dev; struct arpcom sc_ac; @@ -1948,6 +1964,11 @@ struct mcx_softc { int sc_extra_mcast; uint8_t sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN]; + struct mcx_calibration sc_calibration[2]; + unsigned int sc_calibration_gen; + uint32_t sc_freq; + struct timeout sc_calibrate; + struct mcx_cq sc_cq[MCX_MAX_CQS]; int sc_num_cq; @@ -2039,8 +2060,8 @@ static void mcx_cmdq_dump(const struct m static void mcx_cmdq_mbox_dump(struct mcx_dmamem *, int); */ static void mcx_refill(void *); -static void mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *, - struct mbuf_list *, int *); +static int mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *, + struct mbuf_list *, const struct mcx_calibration *); static void mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *, int *); static void mcx_process_cq(struct mcx_softc *, struct mcx_cq *); @@ -2060,6 +2081,9 @@ static void mcx_media_status(struct ifne static int mcx_media_change(struct ifnet *); static int mcx_get_sffpage(struct ifnet *, struct if_sffpage *); +static void mcx_calibrate_first(struct mcx_softc *); +static void mcx_calibrate(void *); + static inline uint32_t mcx_rd(struct mcx_softc *, bus_size_t); static inline void @@ -2067,6 +2091,8 @@ static inline void static inline void mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int); +static uint64_t mcx_timer(struct mcx_softc *); + static int mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *, bus_size_t, u_int align); static void mcx_dmamem_zero(struct mcx_dmamem *); @@ -2338,6 +2364,7 @@ mcx_attach(struct device *parent, struct ether_ifattach(ifp); timeout_set(&sc->sc_rx_refill, mcx_refill, sc); + timeout_set(&sc->sc_calibrate, mcx_calibrate, sc); sc->sc_flow_table_id = -1; for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) { @@ -5557,9 +5590,65 @@ mcx_process_txeof(struct mcx_softc *sc, ms->ms_m = NULL; } -void +static uint64_t +mcx_uptime(void) +{ + struct timespec ts; + + nanouptime(&ts); + + return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec); +} + +static void +mcx_calibrate_first(struct mcx_softc *sc) +{ + struct mcx_calibration *c = &sc->sc_calibration[0]; + + sc->sc_calibration_gen = 0; + + c->c_ubase = mcx_uptime(); + c->c_tbase = mcx_timer(sc); + c->c_tdiff = 0; + + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST); +} + +#define MCX_TIMESTAMP_SHIFT 10 + +static void +mcx_calibrate(void *arg) +{ + struct mcx_softc *sc = arg; + struct mcx_calibration *nc, *pc; + unsigned int gen; + + if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING)) + return; + + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL); + + gen = sc->sc_calibration_gen; + pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)]; + gen++; + nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)]; + + nc->c_uptime = pc->c_ubase; + nc->c_timestamp = pc->c_tbase; + + nc->c_ubase = mcx_uptime(); + nc->c_tbase = mcx_timer(sc); + + nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT; + nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT; + + membar_producer(); + sc->sc_calibration_gen = gen; +} + +static int mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe, - struct mbuf_list *ml, int *slots) + struct mbuf_list *ml, const struct mcx_calibration *c) { struct mcx_slot *ms; struct mbuf *m; @@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str m = ms->ms_m; ms->ms_m = NULL; - m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt); - (*slots)++; + + m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt); + + if (cqe->cq_rx_hash_type) { + m->m_pkthdr.ph_flowid = M_FLOWID_VALID | + bemtoh32(&cqe->cq_rx_hash); + } + + if (c->c_tdiff) { + uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp; + t *= c->c_udiff; + t /= c->c_tdiff; + + m->m_pkthdr.ph_timestamp = c->c_uptime + t; + SET(m->m_pkthdr.csum_flags, M_TIMESTAMP); + } ml_enqueue(ml, m); + + return (1); } static struct mcx_cq_entry * @@ -5624,11 +5729,17 @@ void mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq) { struct ifnet *ifp = &sc->sc_ac.ac_if; + const struct mcx_calibration *c; + unsigned int gen; struct mcx_cq_entry *cqe; uint8_t *cqp; struct mbuf_list ml = MBUF_LIST_INITIALIZER(); int rxfree, txfree; + gen = sc->sc_calibration_gen; + membar_consumer(); + c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)]; + rxfree = 0; txfree = 0; while ((cqe = mcx_next_cq_entry(sc, cq))) { @@ -5639,7 +5750,7 @@ mcx_process_cq(struct mcx_softc *sc, str mcx_process_txeof(sc, cqe, &txfree); break; case MCX_CQ_ENTRY_OPCODE_SEND: - mcx_process_rx(sc, cqe, &ml, &rxfree); + rxfree += mcx_process_rx(sc, cqe, &ml, c); break; case MCX_CQ_ENTRY_OPCODE_REQ_ERR: case MCX_CQ_ENTRY_OPCODE_SEND_ERR: @@ -5882,6 +5993,8 @@ mcx_up(struct mcx_softc *sc) sc->sc_rx_prod = 0; mcx_rx_fill(sc); + mcx_calibrate_first(sc); + SET(ifp->if_flags, IFF_RUNNING); sc->sc_tx_cons = 0; @@ -5922,6 +6035,8 @@ mcx_down(struct mcx_softc *sc) intr_barrier(&sc->sc_ih); ifq_barrier(&ifp->if_snd); + timeout_del_barrier(&sc->sc_calibrate); + for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) { if (sc->sc_flow_group_id[group] != -1) mcx_destroy_flow_group(sc, @@ -6435,6 +6550,26 @@ static inline void mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f) { bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f); +} + +static uint64_t +mcx_timer(struct mcx_softc *sc) +{ + uint32_t hi, lo, ni; + + hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H); + for (;;) { + lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L); + mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ); + ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H); + + if (ni == hi) + break; + + hi = ni; + } + + return (((uint64_t)hi << 32) | (uint64_t)lo); } static int