--- /sys/src/9k/ip/arp.c +++ /sys/src/9k/ip/arp.c @@ -523,10 +523,10 @@ enum char *aformat = "%-6.6s %-8.8s %-40.40I %-32.32s\n"; static void -convmac(char *p, uchar *mac, int n) +convmac(char *p, char *ep, uchar *mac, int n) { while(n-- > 0) - p += sprint(p, "%2.2ux", *mac++); + p = seprint(p, ep, "%2.2ux", *mac++); } int @@ -552,8 +552,9 @@ arpread(Arp *arp, char *p, ulong offset, int len) } len--; qlock(arp); - convmac(mac, a->mac, a->type->maclen); - n += sprint(p+n, aformat, a->type->name, arpstate[a->state], a->ip, mac); + convmac(mac, &mac[sizeof mac], a->mac, a->type->maclen); + n += snprint(p+n, Alinelen+1, aformat, a->type->name, + arpstate[a->state], a->ip, mac); /* +1 for NUL */ qunlock(arp); } --- /sys/src/9k/ip/chandial.c +++ /sys/src/9k/ip/chandial.c @@ -66,7 +66,7 @@ call(char *clone, char *dest, DS *ds) name[n] = 0; for(p = name; *p == ' '; p++) ; - sprint(name, "%lud", strtoul(p, 0, 0)); + snprint(name, sizeof name, "%lud", strtoul(p, 0, 0)); p = strrchr(clone, '/'); *p = 0; if(ds->dir) --- /sys/src/9k/ip/ethermedium.c +++ /sys/src/9k/ip/ethermedium.c @@ -9,6 +9,8 @@ #include "ip.h" #include "ipv6.h" +#include "etherif.h" + typedef struct Etherhdr Etherhdr; struct Etherhdr { @@ -94,9 +96,6 @@ struct Etherrock */ enum { - ETARP = 0x0806, - ETIP4 = 0x0800, - ETIP6 = 0x86DD, ARPREQUEST = 1, ARPREPLY = 2, }; @@ -415,7 +414,7 @@ etheraddmulti(Ipifc *ifc, uchar *a, uchar *) int version; version = multicastea(mac, a); - sprint(buf, "addmulti %E", mac); + snprint(buf, sizeof buf, "addmulti %E", mac); switch(version){ case V4: er->cchan4->dev->write(er->cchan4, buf, strlen(buf), 0); @@ -437,7 +436,7 @@ etherremmulti(Ipifc *ifc, uchar *a, uchar *) int version; version = multicastea(mac, a); - sprint(buf, "remmulti %E", mac); + snprint(buf, sizeof buf, "remmulti %E", mac); switch(version){ case V4: er->cchan4->dev->write(er->cchan4, buf, strlen(buf), 0); --- /sys/src/9k/ip/icmp.c +++ /sys/src/9k/ip/icmp.c @@ -368,7 +368,7 @@ icmpiput(Proto *icmp, Ipifc*, 
Block *bp) case EchoRequest: if (iplen < n) bp = trimblock(bp, 0, iplen); - r = mkechoreply(bp); + r = mkechoreply(concatblock(bp)); ipriv->out[EchoReply]++; ipoput4(icmp->f, r, 0, MAXTTL, DFLTTOS, nil); break; @@ -395,7 +395,7 @@ icmpiput(Proto *icmp, Ipifc*, Block *bp) break; case TimeExceed: if(p->code == 0){ - sprint(m2, "ttl exceeded at %V", p->src); + snprint(m2, sizeof m2, "ttl exceeded at %V", p->src); bp->rp += ICMP_IPSIZE+ICMP_HDRSIZE; if(blocklen(bp) < MinAdvise){ --- /sys/src/9k/ip/icmp6.c +++ /sys/src/9k/ip/icmp6.c @@ -418,39 +418,37 @@ icmpna(Fs *f, uchar* src, uchar* dst, uchar* targ, uchar* mac, uchar flags) ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil); } +/* if free is true, freeblist(bp) before return. */ extern void icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free) { - int osz = BLEN(bp); - int sz = MIN(IPICMPSZ + osz, v6MINTU); + int osz, sz; Block *nbp; IPICMP *np; + Icmppriv6 *ipriv; Ip6hdr *p; - Proto *icmp = f->t2p[ICMPv6]; - Icmppriv6 *ipriv = icmp->priv; + Proto *icmp; + osz = BLEN(bp); + sz = MIN(IPICMPSZ + osz, v6MINTU); + icmp = f->t2p[ICMPv6]; + ipriv = icmp->priv; p = (Ip6hdr *)bp->rp; - if(isv6mcast(p->src)) - goto clean; - + goto freebl; nbp = newIPICMP(sz); np = (IPICMP *)nbp->rp; rlock(ifc); - if(ipv6anylocal(ifc, np->src)) - netlog(f, Logicmp, "send icmphostunr -> src %I dst %I\n", - p->src, p->dst); - else { + if(!ipv6anylocal(ifc, np->src)){ netlog(f, Logicmp, "icmphostunr fail -> src %I dst %I\n", p->src, p->dst); + runlock(ifc); freeblist(nbp); - if(free) - goto clean; - else - return; + goto freebl; } + netlog(f, Logicmp, "send icmphostunr -> src %I dst %I\n", p->src, p->dst); memmove(np->dst, p->src, IPaddrlen); np->type = UnreachableV6; np->code = code; @@ -462,14 +460,12 @@ icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free) if(free) ipiput6(f, ifc, nbp); - else { + else ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil); - return; - } - -clean: runlock(ifc); - freeblist(bp); +freebl: + if(free) + freeblist(bp); } 
extern void @@ -484,13 +480,11 @@ icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp) Icmppriv6 *ipriv = icmp->priv; p = (Ip6hdr *)bp->rp; - if(isv6mcast(p->src)) return; nbp = newIPICMP(sz); np = (IPICMP *) nbp->rp; - if(ipv6anylocal(ifc, np->src)) netlog(f, Logicmp, "send icmpttlexceeded6 -> src %I dst %I\n", p->src, p->dst); @@ -523,13 +517,11 @@ icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp) Icmppriv6 *ipriv = icmp->priv; p = (Ip6hdr *)bp->rp; - if(isv6mcast(p->src)) return; nbp = newIPICMP(sz); np = (IPICMP *)nbp->rp; - if(ipv6anylocal(ifc, np->src)) netlog(f, Logicmp, "send icmppkttoobig6 -> src %I dst %I\n", p->src, p->dst); @@ -555,15 +547,13 @@ icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp) * RFC 2461, pages 39-40, pages 57-58. */ static int -valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv) +valid(Proto *icmp, Ipifc *, Block *bp, Icmppriv6 *ipriv) { - int sz, osz, unsp, n, ttl, iplen; - int pktsz = BLEN(bp); - uchar *packet = bp->rp; - IPICMP *p = (IPICMP *) packet; + int sz, osz, unsp, n, ttl, iplen, pktsz; + uchar *packet; + IPICMP *p; Ndpkt *np; - USED(ifc); n = blocklen(bp); if(n < IPICMPSZ) { ipriv->stats[HlenErrs6]++; @@ -571,6 +561,9 @@ valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv) goto err; } + packet = bp->rp; + p = (IPICMP *)packet; + pktsz = BLEN(bp); iplen = nhgets(p->ploadlen); if(iplen > n - IP6HDR) { ipriv->stats[LenErrs6]++; @@ -597,91 +590,78 @@ valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv) p->ttl = ttl; /* additional tests for some pkt types */ - if (p->type == NbrSolicit || p->type == NbrAdvert || - p->type == RouterAdvert || p->type == RouterSolicit || - p->type == RedirectV6) { - if(p->ttl != HOP_LIMIT) { - ipriv->stats[HoplimErrs6]++; + if (p->type != NbrSolicit && p->type != NbrAdvert && + p->type != RouterAdvert && p->type != RouterSolicit && + p->type != RedirectV6) + return 1; /* TODO: unknown, presumed valid; why? 
*/ + if(p->ttl != HOP_LIMIT) { + ipriv->stats[HoplimErrs6]++; + goto err; + } + if(p->code != 0) { + ipriv->stats[IcmpCodeErrs6]++; + goto err; + } + + switch (p->type) { + case NbrSolicit: + case NbrAdvert: + np = (Ndpkt*) p; + if(isv6mcast(np->target)) { + ipriv->stats[TargetErrs6]++; goto err; } - if(p->code != 0) { - ipriv->stats[IcmpCodeErrs6]++; + if(optexsts(np) && np->olen == 0) { + ipriv->stats[OptlenErrs6]++; goto err; } - - switch (p->type) { - case NbrSolicit: - case NbrAdvert: - np = (Ndpkt*) p; - if(isv6mcast(np->target)) { - ipriv->stats[TargetErrs6]++; + if (p->type == NbrSolicit && ipcmp(np->src, v6Unspecified) == 0) + if(!issmcast(np->dst) || optexsts(np)) { + ipriv->stats[AddrmxpErrs6]++; goto err; } - if(optexsts(np) && np->olen == 0) { + if(p->type == NbrAdvert && isv6mcast(np->dst) && + nhgets(np->icmpid) & Sflag){ + ipriv->stats[AddrmxpErrs6]++; + goto err; + } + break; + case RouterAdvert: + if(pktsz - IP6HDR < 16) { + ipriv->stats[HlenErrs6]++; + goto err; + } + if(!islinklocal(p->src)) { + ipriv->stats[RouterAddrErrs6]++; + goto err; + } + for (sz = IPICMPSZ + 8; sz+1 < pktsz; sz += 8*osz) { + osz = packet[sz+1]; + if(osz <= 0) { ipriv->stats[OptlenErrs6]++; goto err; } - - if (p->type == NbrSolicit && - ipcmp(np->src, v6Unspecified) == 0) - if(!issmcast(np->dst) || optexsts(np)) { - ipriv->stats[AddrmxpErrs6]++; - goto err; - } - - if(p->type == NbrAdvert) - if(isv6mcast(np->dst) && - (nhgets(np->icmpid) & Sflag)){ - ipriv->stats[AddrmxpErrs6]++; - goto err; - } - break; - - case RouterAdvert: - if(pktsz - IP6HDR < 16) { - ipriv->stats[HlenErrs6]++; - goto err; - } - if(!islinklocal(p->src)) { - ipriv->stats[RouterAddrErrs6]++; - goto err; - } - sz = IPICMPSZ + 8; - while (sz+1 < pktsz) { - osz = packet[sz+1]; - if(osz <= 0) { - ipriv->stats[OptlenErrs6]++; - goto err; - } - sz += 8*osz; - } - break; - - case RouterSolicit: - if(pktsz - IP6HDR < 8) { - ipriv->stats[HlenErrs6]++; + } + break; + case RouterSolicit: + if(pktsz - IP6HDR < 8) 
{ + ipriv->stats[HlenErrs6]++; + goto err; + } + unsp = (ipcmp(p->src, v6Unspecified) == 0); + for (sz = IPICMPSZ + 8; sz+1 < pktsz; sz += 8*osz) { + osz = packet[sz+1]; + if(osz <= 0 || (unsp && packet[sz] == SRC_LLADDR)) { + ipriv->stats[OptlenErrs6]++; goto err; } - unsp = (ipcmp(p->src, v6Unspecified) == 0); - sz = IPICMPSZ + 8; - while (sz+1 < pktsz) { - osz = packet[sz+1]; - if(osz <= 0 || - (unsp && packet[sz] == SRC_LLADDR)) { - ipriv->stats[OptlenErrs6]++; - goto err; - } - sz += 8*osz; - } - break; - - case RedirectV6: - /* to be filled in */ - break; - - default: - goto err; } + break; + case RedirectV6: + /* TODO: fill in */ + break; + default: + goto err; } return 1; err: @@ -712,35 +692,39 @@ targettype(Fs *f, Ipifc *ifc, uchar *target) return 0; } +/* bp needs to be freed with freeblist or passed on. */ static void icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp) { - int refresh = 1; + int type; char *msg, m2[128]; uchar pktflags; - uchar *packet = bp->rp; + uchar *packet, *src; uchar lsrc[IPaddrlen]; Block *r; - IPICMP *p = (IPICMP *)packet; - Icmppriv6 *ipriv = icmp->priv; + IPICMP *p; + Icmppriv6 *ipriv; Iplifc *lifc; Ndpkt* np; Proto *pr; - if(!valid(icmp, ipifc, bp, ipriv) || p->type > Maxtype6) + packet = bp->rp; + p = (IPICMP *)packet; + type = p->type; + ipriv = icmp->priv; + if(!valid(icmp, ipifc, bp, ipriv) || type > Maxtype6) goto raise; - ipriv->in[p->type]++; - - switch(p->type) { + ipriv->in[type]++; + switch(type) { case EchoRequestV6: + bp = concatblock(bp); r = mkechoreply6(bp, ipifc); if(r == nil) goto raise; ipriv->out[EchoReply]++; ipoput6(icmp->f, r, 0, MAXTTL, DFLTTOS, nil); break; - case UnreachableV6: if(p->code >= nelem(unreachcode)) msg = unreachcode[Icmp6_unknown]; @@ -762,11 +746,9 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp) bp->rp -= IPICMPSZ; goticmpkt6(icmp, bp, 0); break; - case TimeExceedV6: if(p->code == 0){ - sprint(m2, "ttl exceeded at %I", p->src); - + snprint(m2, sizeof m2, "ttl exceeded at %I", p->src); 
bp->rp += IPICMPSZ; if(blocklen(bp) < 8){ ipriv->stats[LenErrs6]++; @@ -780,10 +762,8 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp) } bp->rp -= IPICMPSZ; } - goticmpkt6(icmp, bp, 0); break; - case RouterAdvert: case RouterSolicit: /* using lsrc as a temp, munge hdr for goticmp6 */ @@ -792,43 +772,39 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp) memmove(p->src, p->dst, IPaddrlen); memmove(p->dst, lsrc, IPaddrlen); } - goticmpkt6(icmp, bp, p->type); + goticmpkt6(icmp, bp, type); break; - case NbrSolicit: - np = (Ndpkt*) p; + np = (Ndpkt*)p; /* within bp */ pktflags = 0; switch (targettype(icmp->f, ipifc, np->target)) { case Tunirany: pktflags |= Oflag; /* fall through */ - case Tuniproxy: if(ipcmp(np->src, v6Unspecified) != 0) { arpenter(icmp->f, V6, np->src, np->lnaddr, 8*np->olen-2, 0); pktflags |= Sflag; } - if(ipv6local(ipifc, lsrc)) - icmpna(icmp->f, lsrc, - (ipcmp(np->src, v6Unspecified) == 0? - v6allnodesL: np->src), - np->target, ipifc->mac, pktflags); - else - freeblist(bp); + if(ipv6local(ipifc, lsrc)) { + src = np->src; + if(ipcmp(src, v6Unspecified) == 0) + src = v6allnodesL; + icmpna(icmp->f, lsrc, src, np->target, + ipifc->mac, pktflags); + } break; - case Tunitent: - /* not clear what needs to be done. send up - * an icmp mesg saying don't use this address? */ - default: - freeblist(bp); + /* + * not clear what needs to be done. send up + * an icmp mesg saying `don't use this address'? + */ + break; } + freeblist(bp); break; - case NbrAdvert: - np = (Ndpkt*) p; - /* * if the target address matches one of the local interface * addresses and the local interface address has tentative bit @@ -836,20 +812,19 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp) * detection part of ipconfig can discover duplication through * the arp table. 
*/ + np = (Ndpkt*)p; /* within bp */ lifc = iplocalonifc(ipifc, np->target); - if(lifc && lifc->tentative) - refresh = 0; arpenter(icmp->f, V6, np->target, np->lnaddr, 8*np->olen-2, - refresh); + lifc && lifc->tentative); freeblist(bp); break; - case PacketTooBigV6: default: goticmpkt6(icmp, bp, 0); break; } return; + raise: freeblist(bp); } @@ -870,14 +845,12 @@ icmpstats6(Proto *icmp6, char *buf, int len) if(icmpnames6[i]) p = seprint(p, e, "%s: %lud %lud\n", icmpnames6[i], priv->in[i], priv->out[i]); -/* else + else if (0) p = seprint(p, e, "%d: %lud %lud\n", i, priv->in[i], priv->out[i]); - */ return p - buf; } - /* import from icmp.c */ extern int icmpstate(Conv *c, char *state, int n); extern char* icmpannounce(Conv *c, char **argv, int argc); --- /sys/src/9k/ip/ip.c +++ /sys/src/9k/ip/ip.c @@ -215,7 +215,8 @@ ipoput4(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c) return 0; } -if((eh->frag[0] & (IP_DF>>8)) && !gating) print("%V: DF set\n", eh->dst); + if((eh->frag[0] & (IP_DF>>8)) && !gating) + print("%V: DF set\n", eh->dst); if(eh->frag[0] & (IP_DF>>8)){ ip->stats[FragFails]++; --- /sys/src/9k/ip/ip.h +++ /sys/src/9k/ip/ip.h @@ -38,7 +38,7 @@ enum Addrlen= 64, Maxproto= 20, Nhash= 64, - Maxincall= 128, + Maxincall= 128, /* max. 
conn.s in listen q not accepted yet */ Nchans= 1024, MAClen= 16, /* longest mac address */ --- /sys/src/9k/ip/ipifc.c +++ /sys/src/9k/ip/ipifc.c @@ -16,6 +16,7 @@ enum { NHASH = 1<<6, NCACHE = 256, QMAX = 192*1024-1, + Maxv6repr = (128/(4*4))*(4+1), /* limit of xxxx:xxxx:⋯ notation */ }; Medium *media[Maxmedia] = { 0 }; @@ -1611,7 +1612,7 @@ ipifcadd6(Ipifc *ifc, char**argv, int argc) { int plen = 64; long origint = NOW / 1000, preflt = ~0L, validlt = ~0L; - char addr[40], preflen[6]; + char addr[Maxv6repr], preflen[6]; char *params[3]; uchar autoflag = 1, onlink = 1; uchar prefix[IPaddrlen]; @@ -1639,9 +1640,17 @@ ipifcadd6(Ipifc *ifc, char**argv, int argc) return Ebadarg; } - if (parseip(prefix, argv[1]) != 6 || validlt < preflt || plen < 0 || - plen > 64 || islinklocal(prefix)) - return Ebadarg; + if (parseip(prefix, argv[1]) != 6) + return "bad ipv6 address"; + if (validlt < preflt) + return "valid ipv6 lifetime less than preferred lifetime"; + if (plen < 0) + return "negative ipv6 prefix length"; + /* i think that this length limit is bogus - geoff */ +// if (plen > 64) +// return "ipv6 prefix length greater than 64; + if (islinklocal(prefix)) + return "ipv6 prefix is link-local"; lifc = smalloc(sizeof(Iplifc)); lifc->onlink = (onlink != 0); @@ -1652,10 +1661,10 @@ ipifcadd6(Ipifc *ifc, char**argv, int argc) /* issue "add" ctl msg for v6 link-local addr and prefix len */ if(!ifc->medium->pref2addr) - return Ebadarg; + return "no pref2addr on interface"; ifc->medium->pref2addr(prefix, ifc->mac); /* mac → v6 link-local addr */ - sprint(addr, "%I", prefix); - sprint(preflen, "/%d", plen); + snprint(addr, sizeof addr, "%I", prefix); + snprint(preflen, sizeof preflen, "/%d", plen); params[0] = "add"; params[1] = addr; params[2] = preflen; --- /sys/src/9k/ip/ipv6.c +++ /sys/src/9k/ip/ipv6.c @@ -259,7 +259,7 @@ ipiput6(Fs *f, Ipifc *ifc, Block *bp) tentative = iptentative(f, v6dst); if(tentative && h->proto != ICMPv6) { - print("tentative addr, drop\n"); + 
print("ipv6 non-icmp tentative addr %I, drop\n", v6dst); freeblist(bp); return; } --- /sys/src/9k/ip/ipv6.h +++ /sys/src/9k/ip/ipv6.h @@ -20,10 +20,6 @@ #define optexsts(np) (nhgets((np)->ploadlen) > 24) #define issmcast(addr) (memcmp((addr), v6solicitednode, 13) == 0) -#ifndef MIN -#define MIN(a, b) ((a) <= (b)? (a): (b)) -#endif - enum { /* Header Types */ HBH = 0, /* hop-by-hop multicast routing protocol */ ICMP = 1, --- /sys/src/9k/ip/tcp.c +++ /sys/src/9k/ip/tcp.c @@ -81,7 +81,13 @@ enum NLHT = 256, /* hash table size, must be a power of 2 */ LHTMASK = NLHT-1, - HaveWS = 1<<8, + /* + * window is 64kb * 2ⁿ + * these factors determine the ultimate bandwidth-delay product. + * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms. + */ + Maxqscale = 4, /* maximum queuing scale */ + Defadvscale = 4, /* default advertisement */ }; /* Must correspond to the enumeration above */ @@ -122,6 +128,7 @@ struct Tcp4hdr uchar tcplen[2]; uchar tcpsrc[4]; uchar tcpdst[4]; + /* same as v6 from here on */ uchar tcpsport[2]; uchar tcpdport[2]; uchar tcpseq[4]; @@ -143,6 +150,7 @@ struct Tcp6hdr uchar ttl; uchar tcpsrc[IPaddrlen]; uchar tcpdst[IPaddrlen]; + /* same as v4 from here on */ uchar tcpsport[2]; uchar tcpdport[2]; uchar tcpseq[4]; @@ -169,8 +177,9 @@ struct Tcp ulong seq; ulong ack; uchar flags; - ushort ws; /* window scale option (if not zero) */ - ulong wnd; + uchar update; + ushort ws; /* window scale option */ + ulong wnd; /* prescaled window*/ ushort urg; ushort mss; /* max segment size option (if not zero) */ ushort len; /* size of data */ @@ -205,44 +214,57 @@ struct Tcpctl ulong wnd; /* Tcp send window */ ulong urg; /* Urgent data pointer */ ulong wl2; - int scale; /* how much to right shift window in xmitted packets */ + uint scale; /* how much to right shift window */ + /* in xmitted packets */ /* to implement tahoe and reno TCP */ ulong dupacks; /* number of duplicate acks rcvd */ + ulong partialack; int recovery; /* loss recovery flag */ + int retransmit; /* 
retransmit 1 packet @ una flag */ + int rto; ulong rxt; /* right window marker for recovery */ + /* "recover" rfc3782 */ } snd; struct { ulong nxt; /* Receive pointer to next uchar slot */ ulong wnd; /* Receive window incoming */ + ulong wsnt; /* Last wptr sent. important to */ + /* track for large bdp */ + ulong wptr; ulong urg; /* Urgent pointer */ + ulong ackptr; /* last acked sequence */ int blocked; - int una; /* unacked data segs */ - int scale; /* how much to left shift window in rcved packets */ + uint scale; /* how much to left shift window in */ + /* rcv'd packets */ } rcv; ulong iss; /* Initial sequence number */ - int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ ulong cwind; /* Congestion window */ - int scale; /* desired snd.scale */ - ushort ssthresh; /* Slow start threshold */ + ulong abcbytes; /* appropriate byte counting rfc 3465 */ + uint scale; /* desired snd.scale */ + ulong ssthresh; /* Slow start threshold */ int resent; /* Bytes just resent */ int irs; /* Initial received squence */ ushort mss; /* Maximum segment size */ int rerecv; /* Overlap of data rerecevived */ - ulong window; /* Receive window */ + ulong window; /* Our receive window (queue) */ + uint qscale; /* Log2 of our receive window (queue) */ uchar backoff; /* Exponential backoff counter */ int backedoff; /* ms we've backed off for rexmits */ uchar flags; /* State flags */ Reseq *reseq; /* Resequencing queue */ + int nreseq; + int reseqlen; Tcptimer timer; /* Activity timer */ Tcptimer acktimer; /* Acknowledge timer */ Tcptimer rtt_timer; /* Round trip timer */ Tcptimer katimer; /* keep alive timer */ ulong rttseq; /* Round trip sequence */ - int srtt; /* Shortened round trip */ + int srtt; /* Smoothed round trip */ int mdev; /* Mean deviation of round trip */ int kacounter; /* count down for keep alive */ uint sndsyntime; /* time syn sent */ ulong time; /* time Finwait2 or Syn_received was sent */ + ulong timeuna; /* snd.una when time was set */ int nochecksum; /* 
non-zero means don't send checksums */ int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ @@ -285,7 +307,6 @@ struct Limbo }; int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */ -ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ enum { /* MIB stats */ @@ -298,6 +319,7 @@ enum { InSegs, OutSegs, RetransSegs, + RetransSegsSent, RetransTimeouts, InErrs, OutRsts, @@ -306,12 +328,24 @@ enum { CsumErrs, HlenErrs, LenErrs, + Resequenced, OutOfOrder, + ReseqBytelim, + ReseqPktlim, + Delayack, + Wopenack, + + Recovery, + RecoveryDone, + RecoveryRTO, + RecoveryNoSeq, + RecoveryCwind, + RecoveryPA, Nstats }; -static char *statnames[] = +static char *statnames[Nstats] = { [MaxConn] "MaxConn", [Mss] "MaxSegment", @@ -322,6 +356,7 @@ static char *statnames[] = [InSegs] "InSegs", [OutSegs] "OutSegs", [RetransSegs] "RetransSegs", +[RetransSegsSent] "RetransSegsSent", [RetransTimeouts] "RetransTimeouts", [InErrs] "InErrs", [OutRsts] "OutRsts", @@ -329,6 +364,19 @@ static char *statnames[] = [HlenErrs] "HlenErrs", [LenErrs] "LenErrs", [OutOfOrder] "OutOfOrder", +[Resequenced] "Resequenced", +[ReseqBytelim] "ReseqBytelim", +[ReseqPktlim] "ReseqPktlim", +[Delayack] "Delayack", +[Wopenack] "Wopenack", + +[Recovery] "Recovery", +[RecoveryDone] "RecoveryDone", +[RecoveryRTO] "RecoveryRTO", + +[RecoveryNoSeq] "RecoveryNoSeq", +[RecoveryCwind] "RecoveryCwind", +[RecoveryPA] "RecoveryPA", }; typedef struct Tcppriv Tcppriv; @@ -363,27 +411,27 @@ struct Tcppriv */ int tcpporthogdefense = 0; -static int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); -static void getreseq(Tcpctl*, Tcp*, Block**, ushort*); -static void localclose(Conv*, char*); -static void procsyn(Conv*, Tcp*); -static void tcpacktimer(void*); -static void tcpiput(Proto*, Ipifc*, Block*); -static void tcpkeepalive(void*); -static void tcpoutput(Conv*); -static void tcprcvwin(Conv*); -static void tcprxmit(Conv*); -static void tcpsetkacounter(Tcpctl*); -static void tcpsetscale(Conv*, Tcpctl*, 
ushort, ushort); -static void tcpsettimer(Tcpctl*); -static void tcpsndsyn(Conv*, Tcpctl*); -static void tcpstart(Conv*, int); -static void tcpsynackrtt(Conv*); -static void tcptimeout(void*); -static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); - -static void limborexmit(Proto*); -static void limbo(Conv*, uchar*, uchar*, Tcp*, int); +static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); +static int dumpreseq(Tcpctl*); +static void getreseq(Tcpctl*, Tcp*, Block**, ushort*); +static void limbo(Conv*, uchar*, uchar*, Tcp*, int); +static void limborexmit(Proto*); +static void localclose(Conv*, char*); +static void procsyn(Conv*, Tcp*); +static void tcpacktimer(void*); +static void tcpiput(Proto*, Ipifc*, Block*); +static void tcpkeepalive(void*); +static void tcpoutput(Conv*); +static void tcprcvwin(Conv*); +static void tcprxmit(Conv*); +static void tcpsetkacounter(Tcpctl*); +static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); +static void tcpsettimer(Tcpctl*); +static void tcpsndsyn(Conv*, Tcpctl*); +static void tcpstart(Conv*, int); +static void tcpsynackrtt(Conv*); +static void tcptimeout(void*); +static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); static void tcpsetstate(Conv *s, uchar newstate) @@ -405,11 +453,6 @@ tcpsetstate(Conv *s, uchar newstate) if(newstate == Established) tpriv->stats[CurrEstab]++; - /** - print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, - tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); - **/ - switch(newstate) { case Closed: qclose(s->rq); @@ -454,12 +497,16 @@ tcpstate(Conv *c, char *state, int n) s = (Tcpctl*)(c->ptcl); return snprint(state, n, - "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", + "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud " + "swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d " + "timer.count %d rerecv %d katimer.start %d katimer.count %d\n", 
tcpstates[s->state], c->rq ? qlen(c->rq) : 0, c->wq ? qlen(c->wq) : 0, - s->srtt, s->mdev, + s->nreseq, s->reseqlen, + s->srtt, s->mdev, s->ssthresh, s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, + s->qscale, s->timer.start, s->timer.count, s->rerecv, s->katimer.start, s->katimer.count); } @@ -558,7 +605,6 @@ tcpkick(void *x) /* * Push data */ - tcprcvwin(s); tcpoutput(s); break; default: @@ -570,6 +616,8 @@ tcpkick(void *x) poperror(); } +static int seq_lt(ulong, ulong); + static void tcprcvwin(Conv *s) /* Call with tcb locked */ { @@ -580,11 +628,17 @@ tcprcvwin(Conv *s) /* Call with tcb locked */ w = tcb->window - qlen(s->rq); if(w < 0) w = 0; - if(w == 0) - netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq)); - tcb->rcv.wnd = w; - if(w == 0) + /* RFC 1122 § 4.2.2.17 do not move right edge of window left */ + if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr)) + w = tcb->rcv.wptr - tcb->rcv.nxt; + if(w != tcb->rcv.wnd) + if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){ tcb->rcv.blocked = 1; + netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n", + tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport); + } + tcb->rcv.wnd = w; + tcb->rcv.wptr = tcb->rcv.nxt + w; } static void @@ -603,7 +657,6 @@ tcpacktimer(void *v) qlock(s); if(tcb->state != Closed){ tcb->flags |= FORCE; - tcprcvwin(s); tcpoutput(s); } qunlock(s); @@ -611,10 +664,51 @@ tcpacktimer(void *v) } static void +tcpcongestion(Tcpctl *tcb) +{ + ulong inflight; + + inflight = tcb->snd.nxt - tcb->snd.una; + if(inflight > tcb->cwind) + inflight = tcb->cwind; + tcb->ssthresh = inflight / 2; + if(tcb->ssthresh < 2*tcb->mss) + tcb->ssthresh = 2*tcb->mss; +} + +enum { + L = 2, /* aggressive slow start; legal values ∈ (1.0, 2.0) */ +}; + +static void +tcpabcincr(Tcpctl *tcb, uint acked) +{ + uint limit; + + tcb->abcbytes += acked; + if(tcb->cwind < tcb->ssthresh){ + /* slow start */ + if(tcb->snd.rto) + limit = tcb->mss; + else 
+ limit = L*tcb->mss; + tcb->cwind += MIN(tcb->abcbytes, limit); + tcb->abcbytes = 0; + } else { + tcb->snd.rto = 0; + /* avoidance */ + if(tcb->abcbytes >= tcb->cwind){ + tcb->abcbytes -= tcb->cwind; + tcb->cwind += tcb->mss; + } + } +} + +static void tcpcreate(Conv *c) { c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); - c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); + c->wq = qopen(QMAX, Qkick, tcpkick, c); } static void @@ -728,7 +822,6 @@ static void localclose(Conv *s, char *reason) /* called with tcb locked */ { Tcpctl *tcb; - Reseq *rp,*rp1; Tcppriv *tpriv; tpriv = s->p->priv; @@ -742,12 +835,7 @@ localclose(Conv *s, char *reason) /* called with tcb locked */ tcphalt(tpriv, &tcb->katimer); /* Flush reassembly queue; nothing more can arrive */ - for(rp = tcb->reseq; rp != nil; rp = rp1) { - rp1 = rp->next; - freeblist(rp->bp); - free(rp); - } - tcb->reseq = nil; + dumpreseq(tcb); if(tcb->state == Syn_sent) Fsconnected(s, reason); @@ -762,7 +850,7 @@ localclose(Conv *s, char *reason) /* called with tcb locked */ /* mtu (- TCP + IP hdr len) of 1st hop */ static int -tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) +tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale) { Ipifc *ifc; int mtu; @@ -781,17 +869,11 @@ tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) mtu = ifc->maxtu - ifc->medium->hsize - (TCP6_PKT + TCP6_HDRSIZE); break; } - if(ifc != nil){ - if(ifc->mbps > 1000) - *scale = HaveWS | 4; - else if(ifc->mbps > 100) - *scale = HaveWS | 3; - else if(ifc->mbps > 10) - *scale = HaveWS | 1; - else - *scale = HaveWS | 0; - } else - *scale = HaveWS | 0; + /* + * set the ws. it doesn't commit us to anything. + * ws is the ultimate limit to the bandwidth-delay product. 
+ */ + *scale = Defadvscale; return mtu; } @@ -809,7 +891,7 @@ inittcpctl(Conv *s, int mode) memset(tcb, 0, sizeof(Tcpctl)); - tcb->ssthresh = 65535; + tcb->ssthresh = QMAX; /* reset by tcpsetscale() */ tcb->srtt = tcp_irtt<mdev = 0; @@ -858,15 +940,12 @@ inittcpctl(Conv *s, int mode) } tcb->mss = tcb->cwind = mss; + tcb->abcbytes = 0; tpriv = s->p->priv; tpriv->stats[Mss] = tcb->mss; /* default is no window scaling */ - tcb->window = QMAX; - tcb->rcv.wnd = QMAX; - tcb->rcv.scale = 0; - tcb->snd.scale = 0; - qsetlimit(s->rq, QMAX); + tcpsetscale(s, tcb, 0, 0); } /* @@ -884,7 +963,7 @@ tcpstart(Conv *s, int mode) if(tpriv->ackprocstarted == 0){ qlock(&tpriv->apl); if(tpriv->ackprocstarted == 0){ - sprint(kpname, "#I%dtcpack", s->p->f->dev); + snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev); kproc(kpname, tcpackproc, s->p); tpriv->ackprocstarted = 1; } @@ -914,24 +993,24 @@ tcpstart(Conv *s, int mode) } static char* -tcpflag(ushort flag) +tcpflag(char *buf, char *e, ushort flag) { - static char buf[128]; + char *p; - sprint(buf, "%d", flag>>10); /* Head len */ + p = seprint(buf, e, "%d", flag>>10); /* Head len */ if(flag & URG) - strcat(buf, " URG"); + p = seprint(p, e, " URG"); if(flag & ACK) - strcat(buf, " ACK"); + p = seprint(p, e, " ACK"); if(flag & PSH) - strcat(buf, " PSH"); + p = seprint(p, e, " PSH"); if(flag & RST) - strcat(buf, " RST"); + p = seprint(p, e, " RST"); if(flag & SYN) - strcat(buf, " SYN"); + p = seprint(p, e, " SYN"); if(flag & FIN) - strcat(buf, " FIN"); - + p = seprint(p, e, " FIN"); + USED(p); return buf; } @@ -992,7 +1071,6 @@ htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) *opt++ = MSSOPT; *opt++ = MSS_LENGTH; hnputs(opt, tcph->mss); -// print("our outgoing mss %d\n", tcph->mss); opt += 2; } if(tcph->ws != 0){ @@ -1033,7 +1111,7 @@ htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) if(tcph->flags & SYN){ if(tcph->mss) hdrlen += MSS_LENGTH; - if(tcph->ws) + if(1) hdrlen += WS_LENGTH; optpad = hdrlen & 3; 
if(optpad) @@ -1075,7 +1153,8 @@ htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) hnputs(opt, tcph->mss); opt += 2; } - if(tcph->ws != 0){ + /* always offer. rfc1323 §2.2 */ + if(1){ *opt++ = WSOPT; *opt++ = WS_LENGTH; *opt++ = tcph->ws; @@ -1123,6 +1202,7 @@ ntohtcp6(Tcp *tcph, Block **bpp) tcph->urg = nhgets(h->tcpurg); tcph->mss = 0; tcph->ws = 0; + tcph->update = 0; tcph->len = nhgets(h->ploadlen) - hdrlen; *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); @@ -1147,7 +1227,7 @@ ntohtcp6(Tcp *tcph, Block **bpp) break; case WSOPT: if(optlen == WS_LENGTH && *(optr+2) <= 14) - tcph->ws = HaveWS | *(optr+2); + tcph->ws = *(optr+2); break; } n -= optlen; @@ -1186,6 +1266,7 @@ ntohtcp4(Tcp *tcph, Block **bpp) tcph->urg = nhgets(h->tcpurg); tcph->mss = 0; tcph->ws = 0; + tcph->update = 0; tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); @@ -1205,14 +1286,12 @@ ntohtcp4(Tcp *tcph, Block **bpp) break; switch(*optr) { case MSSOPT: - if(optlen == MSS_LENGTH) { + if(optlen == MSS_LENGTH) tcph->mss = nhgets(optr+2); -// print("new incoming mss %d\n", tcph->mss); - } break; case WSOPT: if(optlen == WS_LENGTH && *(optr+2) <= 14) - tcph->ws = HaveWS | *(optr+2); + tcph->ws = *(optr+2); break; } n -= optlen; @@ -1234,6 +1313,7 @@ tcpsndsyn(Conv *s, Tcpctl *tcb) tcb->rttseq = tcb->iss; tcb->snd.wl2 = tcb->iss; tcb->snd.una = tcb->iss; + tcb->snd.rxt = tcb->iss; tcb->snd.ptr = tcb->rttseq; tcb->snd.nxt = tcb->rttseq; tcb->flgcnt++; @@ -1348,7 +1428,7 @@ tcphangup(Conv *s) memset(&seg, 0, sizeof seg); seg.flags = RST | ACK; seg.ack = tcb->rcv.nxt; - tcb->rcv.una = 0; + tcb->rcv.ackptr = seg.ack; seg.seq = tcb->snd.ptr; seg.wnd = 0; seg.urg = 0; @@ -1386,7 +1466,7 @@ sndsynack(Proto *tcp, Limbo *lp) Tcp4hdr ph4; Tcp6hdr ph6; Tcp seg; - int scale; + uint scale; /* make pseudo header */ switch(lp->version) { @@ -1420,8 +1500,6 @@ sndsynack(Proto *tcp, Limbo *lp) seg.flags = SYN|ACK; seg.urg = 0; seg.mss = tcpmtu(tcp, lp->laddr, 
lp->version, &scale); -// if (seg.mss > lp->mss && lp->mss >= 512) -// seg.mss = lp->mss; seg.wnd = QMAX; /* if the other side set scale, we should too */ @@ -1599,6 +1677,18 @@ limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) } } +static void +initialwindow(Tcpctl *tcb) +{ + /* RFC 3390 initial window */ + if(tcb->mss < 1095) + tcb->cwind = 4*tcb->mss; + else if(tcb->mss < 2190) + tcb->cwind = 2*2190; + else + tcb->cwind = 2*tcb->mss; +} + /* * come here when we finally get an ACK to our SYN-ACK. * lookup call in limbo. if found, create a new conversation @@ -1670,6 +1760,8 @@ tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) tcb->irs = lp->irs; tcb->rcv.nxt = tcb->irs+1; + tcb->rcv.wptr = tcb->rcv.nxt; + tcb->rcv.wsnt = 0; tcb->rcv.urg = tcb->rcv.nxt; tcb->iss = lp->iss; @@ -1678,6 +1770,7 @@ tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) tcb->snd.una = tcb->iss+1; tcb->snd.ptr = tcb->iss+1; tcb->snd.nxt = tcb->iss+1; + tcb->snd.rxt = tcb->iss+1; tcb->flgcnt = 0; tcb->flags |= SYNACK; @@ -1690,9 +1783,9 @@ tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) /* window scaling */ tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); - /* the congestion window always starts out as a single segment */ + /* congestion window */ tcb->snd.wnd = segp->wnd; - tcb->cwind = tcb->mss; + initialwindow(tcb); /* set initial round trip time */ tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER; @@ -1797,40 +1890,51 @@ update(Conv *s, Tcp *seg) int rtt, delta; Tcpctl *tcb; ulong acked; - ulong expand; Tcppriv *tpriv; - tpriv = s->p->priv; - tcb = (Tcpctl*)s->ptcl; - - /* if everything has been acked, force output(?) 
*/ - if(seq_gt(seg->ack, tcb->snd.nxt)) { - tcb->flags |= FORCE; + if(seg->update) return; - } - - /* added by Dong Lin for fast retransmission */ - if(seg->ack == tcb->snd.una - && tcb->snd.una != tcb->snd.nxt - && seg->len == 0 - && seg->wnd == tcb->snd.wnd) { + seg->update = 1; - /* this is a pure ack w/o window update */ - netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n", - tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); + tpriv = s->p->priv; + tcb = (Tcpctl*)s->ptcl; - if(++tcb->snd.dupacks == TCPREXMTTHRESH) { - /* - * tahoe tcp rxt the packet, half sshthresh, - * and set cwnd to one packet - */ + /* catch zero-window updates, update window & recover */ + if(tcb->snd.wnd == 0 && seg->wnd > 0 && + seq_lt(seg->ack, tcb->snd.ptr)){ + netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n", + seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd); + tcb->snd.wnd = seg->wnd; + goto recovery; + } + + /* newreno fast retransmit */ + if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt && + ++tcb->snd.dupacks == 3){ /* was TCPREXMTTHRESH */ +recovery: + if(tcb->snd.recovery){ + tpriv->stats[RecoveryCwind]++; + tcb->cwind += tcb->mss; + }else if(seq_le(tcb->snd.rxt, seg->ack)){ + tpriv->stats[Recovery]++; + tcb->abcbytes = 0; tcb->snd.recovery = 1; + tcb->snd.partialack = 0; tcb->snd.rxt = tcb->snd.nxt; - netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt); + tcpcongestion(tcb); + tcb->cwind = tcb->ssthresh + 3*tcb->mss; + netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n", + tcb->cwind, tcb->ssthresh, tcb->snd.rxt); tcprxmit(s); - } else { - /* do reno tcp here. 
*/ + }else{ + tpriv->stats[RecoveryNoSeq]++; + netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n", + tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack); + /* don't enter fast retransmit, don't change ssthresh */ } + }else if(tcb->snd.recovery){ + tpriv->stats[RecoveryCwind]++; + tcb->cwind += tcb->mss; } /* @@ -1838,6 +1942,9 @@ update(Conv *s, Tcp *seg) */ if(seq_gt(seg->ack, tcb->snd.wl2) || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ + /* clear dupack if we advance wl2 */ + if(tcb->snd.wl2 != seg->ack) + tcb->snd.dupacks = 0; tcb->snd.wnd = seg->wnd; tcb->snd.wl2 = seg->ack; } @@ -1847,22 +1954,11 @@ update(Conv *s, Tcp *seg) * don't let us hangup if sending into a closed window and * we're still getting acks */ - if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ + if((tcb->flags&RETRAN) && tcb->snd.wnd == 0) tcb->backedoff = MAXBACKMS/4; - } return; } - /* - * any positive ack turns off fast rxt, - * (should we do new-reno on partial acks?) - */ - if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { - tcb->snd.dupacks = 0; - tcb->snd.recovery = 0; - } else - netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind); - /* Compute the new send window size */ acked = seg->ack - tcb->snd.una; @@ -1874,24 +1970,41 @@ update(Conv *s, Tcp *seg) goto done; } - /* slow start as long as we're not recovering from lost packets */ - if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { - if(tcb->cwind < tcb->ssthresh) { - expand = tcb->mss; - if(acked < expand) - expand = acked; - } - else - expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; + /* + * congestion control + */ + if(tcb->snd.recovery){ + if(seq_ge(seg->ack, tcb->snd.rxt)){ + /* recovery finished; deflate window */ + tpriv->stats[RecoveryDone]++; + tcb->snd.dupacks = 0; + tcb->snd.recovery = 0; + tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss; + if(tcb->ssthresh < tcb->cwind) + tcb->cwind = tcb->ssthresh; + netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n", + 
tcb->cwind, tcb->ssthresh); + } else { + /* partial ack; we lost more than one segment */ + tpriv->stats[RecoveryPA]++; + if(tcb->cwind > acked) + tcb->cwind -= acked; + else{ + netlog(s->p->f, Logtcpwin, "partial ack neg\n"); + tcb->cwind = tcb->mss; + } + netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n", + acked, tcb->snd.rxt - seg->ack, tcb->cwind); - if(tcb->cwind + expand < tcb->cwind) - expand = tcb->snd.wnd - tcb->cwind; - if(tcb->cwind + expand > tcb->snd.wnd) - expand = tcb->snd.wnd - tcb->cwind; - tcb->cwind += expand; - } + if(acked >= tcb->mss) + tcb->cwind += tcb->mss; + tcb->snd.partialack++; + } + } else + tcpabcincr(tcb, acked); /* Adjust the timers according to the round trip time */ + /* TODO: fix sloppy treatment of overflow cases here. */ if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { tcphalt(tpriv, &tcb->rtt_timer); if((tcb->flags&RETRAN) == 0) { @@ -1899,7 +2012,7 @@ update(Conv *s, Tcp *seg) tcb->backedoff = 0; rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; if(rtt == 0) - rtt = 1; /* otherwise all close systems will rexmit in 0 time */ + rtt = 1; /* else all close sys's will rexmit in 0 time */ rtt *= MSPTICK; if(tcb->srtt == 0) { tcb->srtt = rtt << LOGAGAIN; @@ -1922,20 +2035,30 @@ update(Conv *s, Tcp *seg) done: if(qdiscard(s->wq, acked) < acked) tcb->flgcnt--; - tcb->snd.una = seg->ack; + + /* newreno fast recovery */ + if(tcb->snd.recovery) + tcprxmit(s); + if(seq_gt(seg->ack, tcb->snd.urg)) tcb->snd.urg = seg->ack; - if(tcb->snd.una != tcb->snd.nxt) - tcpgo(tpriv, &tcb->timer); - else + if(tcb->snd.una != tcb->snd.nxt){ + /* `impatient' variant */ + if(!tcb->snd.recovery || tcb->snd.partialack == 1){ + tcb->time = NOW; + tcb->timeuna = tcb->snd.una; + tcpgo(tpriv, &tcb->timer); + } + } else tcphalt(tpriv, &tcb->timer); if(seq_lt(tcb->snd.ptr, tcb->snd.una)) tcb->snd.ptr = tcb->snd.una; - tcb->flags &= ~RETRAN; + if(!tcb->snd.recovery) + tcb->flags &= ~RETRAN; tcb->backoff = 0; 
tcb->backedoff = 0; } @@ -1962,7 +2085,6 @@ tcpiput(Proto *tcp, Ipifc*, Block *bp) h4 = (Tcp4hdr*)(bp->rp); h6 = (Tcp6hdr*)(bp->rp); - memset(&seg, 0, sizeof seg); if((h4->vihl&0xF0)==IP_VER4) { version = V4; @@ -2171,11 +2293,13 @@ reset: } /* Cut the data to fit the receive window */ + tcprcvwin(s); if(tcptrim(tcb, &seg, &bp, &length) == -1) { - netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud from %I\n", - seg.seq, seg.seq + length - 1, - tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, s->raddr); - netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); + if(seg.seq+1 != tcb->rcv.nxt || length != 1) + netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win " + "%lud-%lud l %d from %I\n", seg.seq, + seg.seq + length - 1, tcb->rcv.nxt, + tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr); update(s, &seg); if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { tcphalt(tpriv, &tcb->rtt_timer); @@ -2206,12 +2330,16 @@ reset: if(seg.seq != tcb->rcv.nxt) if(length != 0 || (seg.flags & (SYN|FIN))) { update(s, &seg); - if(addreseq(tcb, tpriv, &seg, bp, length) < 0) - print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); - tcb->flags |= FORCE; + if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0) + print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, + s->laddr, s->lport); + tcb->flags |= FORCE; /* force duplicate ack; RFC 5681 §3.2 */ goto output; } + if(tcb->nreseq > 0) + tcb->flags |= FORCE; /* filled hole in seq. 
space; RFC 5681 §3.2 */ + /* * keep looping till we've processed this packet plus any * adjacent packets in the resequence queue @@ -2221,7 +2349,11 @@ reset: if(tcb->state == Established) { tpriv->stats[EstabResets]++; if(tcb->rcv.nxt != seg.seq) - print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); + netlog(f, Logtcp, "out of order RST " + "rcvd: %I.%d -> %I.%d, rcv.nxt " + "%lux seq %lux\n", + s->raddr, s->rport, s->laddr, + s->lport, tcb->rcv.nxt, seg.seq); } localclose(s, Econrefused); goto raise; @@ -2315,29 +2447,10 @@ reset: panic("tcp packblock"); qpassnolim(s->rq, bp); bp = nil; - - /* - * Force an ack every 2 data messages. This is - * a hack for rob to make his home system run - * faster. - * - * this also keeps the standard TCP congestion - * control working since it needs an ack every - * 2 max segs worth. This is not quite that, - * but under a real stream is equivalent since - * every packet has a max seg in it. 
- */ - if(++(tcb->rcv.una) >= 2) - tcb->flags |= FORCE; } tcb->rcv.nxt += length; /* - * update our rcv window - */ - tcprcvwin(s); - - /* * turn on the acktimer if there's something * to ack */ @@ -2411,8 +2524,11 @@ reset: getreseq(tcb, &seg, &bp, &length); - if(tcptrim(tcb, &seg, &bp, &length) == 0) + tcprcvwin(s); + if(tcptrim(tcb, &seg, &bp, &length) == 0){ + tcb->flags |= FORCE; break; + } } } output: @@ -2436,11 +2552,11 @@ static void tcpoutput(Conv *s) { Tcp seg; - int msgs; + uint msgs; Tcpctl *tcb; Block *hbp, *bp; - int sndcnt, n; - ulong ssize, dsize, usable, sent; + int sndcnt; + ulong ssize, dsize, sent; Fs *f; Tcppriv *tpriv; uchar version; @@ -2448,11 +2564,26 @@ tcpoutput(Conv *s) f = s->p->f; tpriv = s->p->priv; version = s->ipversion; - memset(&seg, 0, sizeof seg); - for(msgs = 0; msgs < 100; msgs++) { - tcb = (Tcpctl*)s->ptcl; + tcb = (Tcpctl*)s->ptcl; + + /* force ack every 2*mss */ + if((tcb->flags & FORCE) == 0 && + tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){ + tpriv->stats[Delayack]++; + tcb->flags |= FORCE; + } + /* force ack if window opening */ + if((tcb->flags & FORCE) == 0){ + tcprcvwin(s); + if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){ + tpriv->stats[Wopenack]++; + tcb->flags |= FORCE; + } + } + + for(msgs = 0; msgs < 100; msgs++) { switch(tcb->state) { case Listen: case Closed: @@ -2460,7 +2591,12 @@ tcpoutput(Conv *s) return; } + /* Don't send anything else until our SYN has been acked */ + if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) + break; + /* force an ack when a window has opened up */ + tcprcvwin(s); if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ tcb->rcv.blocked = 0; tcb->flags |= FORCE; @@ -2468,55 +2604,53 @@ tcpoutput(Conv *s) sndcnt = qlen(s->wq)+tcb->flgcnt; sent = tcb->snd.ptr - tcb->snd.una; - - /* Don't send anything else until our SYN has been acked */ - if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) - break; - - /* Compute usable segment based on offered window and limit - * 
window probes to one - */ + ssize = sndcnt; if(tcb->snd.wnd == 0){ - if(sent != 0) { - if((tcb->flags&FORCE) == 0) - break; -// tcb->snd.ptr = tcb->snd.una; + /* zero window probe */ + if(sent > 0 && !(tcb->flags & FORCE)) + break; /* already probing, rto re-probes */ + if(ssize < sent) + ssize = 0; + else{ + ssize -= sent; + if(ssize > 0) + ssize = 1; + } + } else { + /* calculate usable segment size */ + if(ssize > tcb->cwind) + ssize = tcb->cwind; + if(ssize > tcb->snd.wnd) + ssize = tcb->snd.wnd; + + if(ssize < sent) + ssize = 0; + else { + ssize -= sent; + if(ssize > tcb->mss) + ssize = tcb->mss; } - usable = 1; } - else { - usable = tcb->cwind; - if(tcb->snd.wnd < usable) - usable = tcb->snd.wnd; -// usable -= sent; - usable = usable >= sent? usable - sent: 0; - } - ssize = sndcnt-sent; - if(ssize && usable < 2) - netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n", - tcb->snd.wnd, tcb->cwind); - if(usable < ssize) - ssize = usable; - if(tcb->mss < ssize) - ssize = tcb->mss; + dsize = ssize; seg.urg = 0; - if(ssize == 0) - if((tcb->flags&FORCE) == 0) - break; + if(!(tcb->flags & FORCE)) + if(ssize == 0 || + ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr && + sent > TCPREXMTTHRESH * tcb->mss) + break; tcb->flags &= ~FORCE; - tcprcvwin(s); /* By default we will generate an ack */ tcphalt(tpriv, &tcb->acktimer); - tcb->rcv.una = 0; seg.source = s->lport; seg.dest = s->rport; seg.flags = ACK; seg.mss = 0; seg.ws = 0; + seg.update = 0; switch(tcb->state){ case Syn_sent: seg.flags = 0; @@ -2556,20 +2690,9 @@ tcpoutput(Conv *s) } } - if(sent+dsize == sndcnt) + if(sent+dsize == sndcnt && dsize) seg.flags |= PSH; - /* keep track of balance of resent data */ - if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { - n = tcb->snd.nxt - tcb->snd.ptr; - if(ssize < n) - n = ssize; - tcb->resent += n; - netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n", - s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt); - tpriv->stats[RetransSegs]++; - } - 
tcb->snd.ptr += ssize; /* Pull up the send pointer so we can accept acks @@ -2605,13 +2728,17 @@ tcpoutput(Conv *s) * expect acknowledges */ if(ssize != 0){ - if(tcb->timer.state != TcptimerON) + if(tcb->timer.state != TcptimerON){ + tcb->time = NOW; + tcb->timeuna = tcb->snd.una; tcpgo(tpriv, &tcb->timer); + } /* If round trip timer isn't running, start it. * measure the longest packet only in case the * transmission time dominates RTT */ + if(tcb->snd.retransmit == 0) if(tcb->rtt_timer.state != TcptimerON) if(ssize == tcb->mss) { tcpgo(tpriv, &tcb->rtt_timer); @@ -2620,6 +2747,10 @@ tcpoutput(Conv *s) } tpriv->stats[OutSegs]++; + if(tcb->snd.retransmit) + tpriv->stats[RetransSegsSent]++; + tcb->rcv.ackptr = seg.ack; + tcb->rcv.wsnt = tcb->rcv.wptr; /* put off the next keep alive */ tcpgo(tpriv, &tcb->katimer); @@ -2640,9 +2771,8 @@ tcpoutput(Conv *s) default: panic("tcpoutput2: version %d", version); } - if((msgs%4) == 1){ + if((msgs%4) == 3){ qunlock(s); - sched(); qlock(s); } } @@ -2673,7 +2803,8 @@ tcpsendka(Conv *s) else seg.seq = tcb->snd.una-1; seg.ack = tcb->rcv.nxt; - tcb->rcv.una = 0; + tcb->rcv.ackptr = seg.ack; + tcprcvwin(s); seg.wnd = tcb->rcv.wnd; if(tcb->state == Finwait2){ seg.flags |= FIN; @@ -2781,29 +2912,37 @@ tcpsetchecksum(Conv *s, char **f, int) return nil; } +/* + * retransmit (at most) one segment at snd.una. 
+ * preserve cwind & snd.ptr + */ static void tcprxmit(Conv *s) { Tcpctl *tcb; + Tcppriv *tpriv; + ulong tcwind, tptr; tcb = (Tcpctl*)s->ptcl; - tcb->flags |= RETRAN|FORCE; - tcb->snd.ptr = tcb->snd.una; - /* - * We should be halving the slow start threshhold (down to one - * mss) but leaving it at mss seems to work well enough - */ - tcb->ssthresh = tcb->mss; - - /* - * pull window down to a single packet - */ + tptr = tcb->snd.ptr; + tcwind = tcb->cwind; + tcb->snd.ptr = tcb->snd.una; tcb->cwind = tcb->mss; + tcb->snd.retransmit = 1; tcpoutput(s); + tcb->snd.retransmit = 0; + tcb->cwind = tcwind; + tcb->snd.ptr = tptr; + + tpriv = s->p->priv; + tpriv->stats[RetransSegs]++; } +/* + * TODO: RFC 4138 F-RTO + */ static void tcptimeout(void *arg) { @@ -2833,11 +2972,29 @@ tcptimeout(void *arg) localclose(s, Etimedout); break; } - netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW); + netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n", + tcb->srtt, tcb->mdev, NOW - tcb->time, + tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr, + tcpstates[s->state]); tcpsettimer(tcb); + if(tcb->snd.rto == 0) + tcpcongestion(tcb); tcprxmit(s); + tcb->snd.ptr = tcb->snd.una; + tcb->cwind = tcb->mss; + tcb->snd.rto = 1; tpriv->stats[RetransTimeouts]++; - tcb->snd.dupacks = 0; + + if(tcb->snd.recovery){ + tcb->snd.dupacks = 0; /* reno rto */ + tcb->snd.recovery = 0; + tpriv->stats[RecoveryRTO]++; + tcb->snd.rxt = tcb->snd.nxt; + netlog(s->p->f, Logtcpwin, + "rto recovery rxt @%lud\n", tcb->snd.nxt); + } + + tcb->abcbytes = 0; break; case Time_wait: localclose(s, nil); @@ -2868,6 +3025,8 @@ procsyn(Conv *s, Tcp *seg) tcb->flags |= FORCE; tcb->rcv.nxt = seg->seq + 1; + tcb->rcv.wptr = tcb->rcv.nxt; + tcb->rcv.wsnt = 0; tcb->rcv.urg = tcb->rcv.nxt; tcb->irs = seg->seq; @@ -2878,20 +3037,55 @@ procsyn(Conv *s, Tcp *seg) tpriv->stats[Mss] = tcb->mss; } - /* the congestion window always starts out as a single segment */ 
tcb->snd.wnd = seg->wnd; - tcb->cwind = tcb->mss; + initialwindow(tcb); +} + +static int +dumpreseq(Tcpctl *tcb) +{ + Reseq *r, *next; + + for(r = tcb->reseq; r != nil; r = next){ + next = r->next; + freeblist(r->bp); + free(r); + } + tcb->reseq = nil; + tcb->nreseq = 0; + tcb->reseqlen = 0; + return -1; +} + +static void +logreseq(Fs *f, Reseq *r, ulong n) +{ + char *s; + + for(; r != nil; r = r->next){ + s = nil; + if(r->next == nil && r->seg.seq != n) + s = "hole/end"; + else if(r->next == nil) + s = "end"; + else if(r->seg.seq != n) + s = "hole"; + if(s != nil) + netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s, + n, r->seg.seq, r->seg.seq - n, r->seg.flags); + n = r->seg.seq + r->seg.len; + } } static int -addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) +addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) { - Reseq *rp, *rp1; - int i, rqlen, qmax; + Reseq *rp, **rr; + int qmax; - rp = malloc(sizeof(Reseq)); + rp = malloc(sizeof *rp); if(rp == nil){ - freeblist(bp); /* bp always consumed by add_reseq */ + freeblist(bp); /* bp always consumed by addreseq */ return 0; } @@ -2899,53 +3093,35 @@ addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) rp->bp = bp; rp->length = length; - /* Place on reassembly list sorting by starting seq number */ - rp1 = tcb->reseq; - if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { - rp->next = rp1; - tcb->reseq = rp; - if(rp->next != nil) - tpriv->stats[OutOfOrder]++; - return 0; - } + tcb->reseqlen += length; + tcb->nreseq++; - rqlen = 0; - for(i = 0;; i++) { - rqlen += rp1->length; - if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { - rp->next = rp1->next; - rp1->next = rp; + /* Place on reassembly list sorting by starting seq number */ + for(rr = &tcb->reseq; ; rr = &(*rr)->next) + if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){ + rp->next = *rr; + *rr = rp; + tpriv->stats[Resequenced]++; if(rp->next != nil) tpriv->stats[OutOfOrder]++; 
break; } - rp1 = rp1->next; - } - qmax = QMAX<<tcb->rcv.scale; - if(rqlen > qmax){ - print("resequence queue > window: %d > %d\n", rqlen, qmax); - i = 0; - for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ - print("%#lux %#lux %#ux\n", rp1->seg.seq, - rp1->seg.ack, rp1->seg.flags); - if(i++ > 10){ - print("...\n"); - break; - } - } - - /* - * delete entire reassembly queue; wait for retransmit. - * - should we be smarter and only delete the tail? - */ - for(rp = tcb->reseq; rp != nil; rp = rp1){ - rp1 = rp->next; - freeblist(rp->bp); - free(rp); - } - tcb->reseq = nil; - return -1; + qmax = tcb->window; + if(tcb->reseqlen > qmax){ + netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", + tcb->reseqlen, qmax, tcb->nreseq); + logreseq(f, tcb->reseq, tcb->rcv.nxt); + tpriv->stats[ReseqBytelim]++; + return dumpreseq(tcb); + } + qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */ + if(tcb->nreseq > qmax){ + netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", + tcb->nreseq, qmax, tcb->reseqlen); + logreseq(f, tcb->reseq, tcb->rcv.nxt); + tpriv->stats[ReseqPktlim]++; + return dumpreseq(tcb); } return 0; } @@ -2965,6 +3141,9 @@ getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) *bp = rp->bp; *length = rp->length; + tcb->nreseq--; + tcb->reseqlen -= rp->length; + free(rp); } @@ -3188,9 +3367,9 @@ tcpsettimer(Tcpctl *tcb) x = backoff(tcb->backoff) * (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; - /* bounded twixt 1/2 and 64 seconds */ - if(x < 500/MSPTICK) - x = 500/MSPTICK; + /* bounded twixt 0.3 and 64 seconds */ + if(x < 300/MSPTICK) + x = 300/MSPTICK; else if(x > (64000/MSPTICK)) x = 64000/MSPTICK; tcb->timer.start = x; @@ -3227,15 +3406,35 @@ tcpinit(Fs *fs) static void tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) { - if(rcvscale){ - tcb->rcv.scale = rcvscale & 0xff; - tcb->snd.scale = sndscale & 0xff; - tcb->window = QMAX<<tcb->snd.scale; - qsetlimit(s->rq, tcb->window); - } else { -
tcb->rcv.scale = 0; - tcb->snd.scale = 0; - tcb->window = QMAX; - qsetlimit(s->rq, tcb->window); - } + /* + * guess at reasonable queue sizes. there's no current way + * to know how many nic receive buffers we can safely tie up in the + * tcp stack, and we don't adjust our queues to maximize throughput + * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be + * respected, but we still control our own buffer commitment by + * keeping a separate qscale. + */ + tcb->rcv.scale = rcvscale & 0xff; + tcb->snd.scale = sndscale & 0xff; + tcb->qscale = rcvscale & 0xff; + if(rcvscale > Maxqscale) + tcb->qscale = Maxqscale; + + if(rcvscale != tcb->rcv.scale) + netlog(s->p->f, Logtcp, "tcpsetscale: window %lud " + "qlen %d >> window %ud lport %d\n", + tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport); + tcb->window = QMAX << tcb->qscale; + tcb->ssthresh = tcb->window; + + /* + * it's important to set wq large enough to cover the full + * bandwidth-delay product. it's possible to be in loss + * recovery with a big window, and we need to keep sending + * into the inflated window. the difference can be huge + * for even modest (70ms) ping times. + */ + qsetlimit(s->rq, tcb->window); + qsetlimit(s->wq, tcb->window); + tcprcvwin(s); } --- /sys/src/9k/k10/etherif.h +++ /sys/src/9k/k10/etherif.h @@ -5,6 +5,11 @@ enum ETHERMAXTU = 1514, /* maximum transmit size */ ETHERHDRSIZE = 14, /* size of an ethernet header */ + /* ethernet packet types */ + ETARP = 0x0806, + ETIP4 = 0x0800, + ETIP6 = 0x86DD, + MaxEther = 48, Ntypes = 8, };